diff --git a/_quarto.yml b/_quarto.yml
index 4aeef501..04181d5c 100644
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -39,7 +39,7 @@ book:
         - sql_I/sql_I.qmd
         - sql_II/sql_II.qmd
         - logistic_regression_1/logistic_reg_1.qmd
-        # - logistic_regression_2/logistic_reg_2.qmd
+        - logistic_regression_2/logistic_reg_2.qmd
         # - pca_1/pca_1.qmd
         # - pca_2/pca_2.qmd
         # - clustering/clustering.qmd
diff --git a/docs/case_study_HCE/case_study_HCE.html b/docs/case_study_HCE/case_study_HCE.html
index b5cc2941..07d9a3d3 100644
--- a/docs/case_study_HCE/case_study_HCE.html
+++ b/docs/case_study_HCE/case_study_HCE.html
@@ -278,6 +278,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/constant_model_loss_transformations/loss_transformations.html b/docs/constant_model_loss_transformations/loss_transformations.html
index a53a5377..56c9c6a6 100644
--- a/docs/constant_model_loss_transformations/loss_transformations.html
+++ b/docs/constant_model_loss_transformations/loss_transformations.html
@@ -280,6 +280,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/cv_regularization/cv_reg.html b/docs/cv_regularization/cv_reg.html
index 0f40966e..bc9cb896 100644
--- a/docs/cv_regularization/cv_reg.html
+++ b/docs/cv_regularization/cv_reg.html
@@ -283,6 +283,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/eda/eda.html b/docs/eda/eda.html
index 56c64c59..a8b412b3 100644
--- a/docs/eda/eda.html
+++ b/docs/eda/eda.html
@@ -283,6 +283,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/feature_engineering/feature_engineering.html b/docs/feature_engineering/feature_engineering.html
index 22d26788..60ec8fa7 100644
--- a/docs/feature_engineering/feature_engineering.html
+++ b/docs/feature_engineering/feature_engineering.html
@@ -283,6 +283,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/gradient_descent/gradient_descent.html b/docs/gradient_descent/gradient_descent.html
index ed238d2c..9a2c5f4e 100644
--- a/docs/gradient_descent/gradient_descent.html
+++ b/docs/gradient_descent/gradient_descent.html
@@ -299,6 +299,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/index.html b/docs/index.html
index 937e421a..63113a69 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -251,6 +251,12 @@
   <a href="./logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="./logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/inference_causality/inference_causality.html b/docs/inference_causality/inference_causality.html
index 240ddea9..67584dca 100644
--- a/docs/inference_causality/inference_causality.html
+++ b/docs/inference_causality/inference_causality.html
@@ -283,6 +283,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/intro_lec/introduction.html b/docs/intro_lec/introduction.html
index 1df54fa6..7b9b5c0a 100644
--- a/docs/intro_lec/introduction.html
+++ b/docs/intro_lec/introduction.html
@@ -244,6 +244,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/intro_to_modeling/intro_to_modeling.html b/docs/intro_to_modeling/intro_to_modeling.html
index 9e5919e4..f8310f1f 100644
--- a/docs/intro_to_modeling/intro_to_modeling.html
+++ b/docs/intro_to_modeling/intro_to_modeling.html
@@ -280,6 +280,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/logistic_regression_1/logistic_reg_1.html b/docs/logistic_regression_1/logistic_reg_1.html
index fece397d..e30ea8cb 100644
--- a/docs/logistic_regression_1/logistic_reg_1.html
+++ b/docs/logistic_regression_1/logistic_reg_1.html
@@ -64,6 +64,7 @@
 <script src="../site_libs/quarto-search/fuse.min.js"></script>
 <script src="../site_libs/quarto-search/quarto-search.js"></script>
 <meta name="quarto:offset" content="../">
+<link href="../logistic_regression_2/logistic_reg_2.html" rel="next">
 <link href="../sql_II/sql_II.html" rel="prev">
 <link href="../data100_logo.png" rel="icon" type="image/png">
 <script src="../site_libs/quarto-html/quarto.js"></script>
@@ -282,6 +283,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link active">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
@@ -1197,6 +1204,9 @@ <h3 data-number="22.4.2" class="anchored" data-anchor-id="likelihood-of-data"><s
       </a>          
   </div>
   <div class="nav-page nav-page-next">
+      <a href="../logistic_regression_2/logistic_reg_2.html" class="pagination-link">
+        <span class="nav-page-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span> <i class="bi bi-arrow-right-short"></i>
+      </a>
   </div>
 </nav>
 </div> <!-- /content -->
diff --git a/docs/logistic_regression_2/images/confusion_matrix.png b/docs/logistic_regression_2/images/confusion_matrix.png
new file mode 100644
index 00000000..75fff830
Binary files /dev/null and b/docs/logistic_regression_2/images/confusion_matrix.png differ
diff --git a/docs/logistic_regression_2/images/confusion_matrix_sklearn.png b/docs/logistic_regression_2/images/confusion_matrix_sklearn.png
new file mode 100644
index 00000000..8126cd8d
Binary files /dev/null and b/docs/logistic_regression_2/images/confusion_matrix_sklearn.png differ
diff --git a/docs/logistic_regression_2/images/decision_boundary.png b/docs/logistic_regression_2/images/decision_boundary.png
new file mode 100644
index 00000000..df94c58e
Binary files /dev/null and b/docs/logistic_regression_2/images/decision_boundary.png differ
diff --git a/docs/logistic_regression_2/images/decision_boundary_true.png b/docs/logistic_regression_2/images/decision_boundary_true.png
new file mode 100644
index 00000000..d3b39b6d
Binary files /dev/null and b/docs/logistic_regression_2/images/decision_boundary_true.png differ
diff --git a/docs/logistic_regression_2/images/linear_separability_1D.png b/docs/logistic_regression_2/images/linear_separability_1D.png
new file mode 100644
index 00000000..98586398
Binary files /dev/null and b/docs/logistic_regression_2/images/linear_separability_1D.png differ
diff --git a/docs/logistic_regression_2/images/linear_separability_2D.png b/docs/logistic_regression_2/images/linear_separability_2D.png
new file mode 100644
index 00000000..6b7af88c
Binary files /dev/null and b/docs/logistic_regression_2/images/linear_separability_2D.png differ
diff --git a/docs/logistic_regression_2/images/log_reg_summary.png b/docs/logistic_regression_2/images/log_reg_summary.png
new file mode 100644
index 00000000..7c0ec940
Binary files /dev/null and b/docs/logistic_regression_2/images/log_reg_summary.png differ
diff --git a/docs/logistic_regression_2/images/pr_curve_perfect.png b/docs/logistic_regression_2/images/pr_curve_perfect.png
new file mode 100644
index 00000000..cfb5f2d9
Binary files /dev/null and b/docs/logistic_regression_2/images/pr_curve_perfect.png differ
diff --git a/docs/logistic_regression_2/images/pr_curve_thresholds.png b/docs/logistic_regression_2/images/pr_curve_thresholds.png
new file mode 100644
index 00000000..c01f478d
Binary files /dev/null and b/docs/logistic_regression_2/images/pr_curve_thresholds.png differ
diff --git a/docs/logistic_regression_2/images/precision-recall-thresh.png b/docs/logistic_regression_2/images/precision-recall-thresh.png
new file mode 100644
index 00000000..c1dc555a
Binary files /dev/null and b/docs/logistic_regression_2/images/precision-recall-thresh.png differ
diff --git a/docs/logistic_regression_2/images/precision_recall_graphic.png b/docs/logistic_regression_2/images/precision_recall_graphic.png
new file mode 100644
index 00000000..241c8fc4
Binary files /dev/null and b/docs/logistic_regression_2/images/precision_recall_graphic.png differ
diff --git a/docs/logistic_regression_2/images/reg_loss.png b/docs/logistic_regression_2/images/reg_loss.png
new file mode 100644
index 00000000..b461c0a4
Binary files /dev/null and b/docs/logistic_regression_2/images/reg_loss.png differ
diff --git a/docs/logistic_regression_2/images/roc_curve.png b/docs/logistic_regression_2/images/roc_curve.png
new file mode 100644
index 00000000..273b0b55
Binary files /dev/null and b/docs/logistic_regression_2/images/roc_curve.png differ
diff --git a/docs/logistic_regression_2/images/roc_curve_perfect.png b/docs/logistic_regression_2/images/roc_curve_perfect.png
new file mode 100644
index 00000000..42a9d848
Binary files /dev/null and b/docs/logistic_regression_2/images/roc_curve_perfect.png differ
diff --git a/docs/logistic_regression_2/images/roc_curve_worst_predictor.png b/docs/logistic_regression_2/images/roc_curve_worst_predictor.png
new file mode 100644
index 00000000..d2b47877
Binary files /dev/null and b/docs/logistic_regression_2/images/roc_curve_worst_predictor.png differ
diff --git a/docs/logistic_regression_2/images/toy_2_point.png b/docs/logistic_regression_2/images/toy_2_point.png
new file mode 100644
index 00000000..a41390ce
Binary files /dev/null and b/docs/logistic_regression_2/images/toy_2_point.png differ
diff --git a/docs/logistic_regression_2/images/toy_3_point.png b/docs/logistic_regression_2/images/toy_3_point.png
new file mode 100644
index 00000000..cc8f9c1d
Binary files /dev/null and b/docs/logistic_regression_2/images/toy_3_point.png differ
diff --git a/docs/logistic_regression_2/images/tpr_fpr.png b/docs/logistic_regression_2/images/tpr_fpr.png
new file mode 100644
index 00000000..69d8df64
Binary files /dev/null and b/docs/logistic_regression_2/images/tpr_fpr.png differ
diff --git a/docs/logistic_regression_2/images/unreg_loss.png b/docs/logistic_regression_2/images/unreg_loss.png
new file mode 100644
index 00000000..616b61cd
Binary files /dev/null and b/docs/logistic_regression_2/images/unreg_loss.png differ
diff --git a/docs/logistic_regression_2/images/varying_threshold.png b/docs/logistic_regression_2/images/varying_threshold.png
new file mode 100644
index 00000000..e90113a4
Binary files /dev/null and b/docs/logistic_regression_2/images/varying_threshold.png differ
diff --git a/docs/logistic_regression_2/logistic_reg_2.html b/docs/logistic_regression_2/logistic_reg_2.html
new file mode 100644
index 00000000..f91b2489
--- /dev/null
+++ b/docs/logistic_regression_2/logistic_reg_2.html
@@ -0,0 +1,938 @@
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
+
+<meta charset="utf-8">
+<meta name="generator" content="quarto-1.3.450">
+
+<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
+
+
+<title>Principles and Techniques of Data Science - 23&nbsp; Logistic Regression II</title>
+<style>
+code{white-space: pre-wrap;}
+span.smallcaps{font-variant: small-caps;}
+div.columns{display: flex; gap: min(4vw, 1.5em);}
+div.column{flex: auto; overflow-x: auto;}
+div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
+ul.task-list{list-style: none;}
+ul.task-list li input[type="checkbox"] {
+  width: 0.8em;
+  margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */ 
+  vertical-align: middle;
+}
+</style>
+
+
+<script src="../site_libs/quarto-nav/quarto-nav.js"></script>
+<script src="../site_libs/quarto-nav/headroom.min.js"></script>
+<script src="../site_libs/clipboard/clipboard.min.js"></script>
+<script src="../site_libs/quarto-search/autocomplete.umd.js"></script>
+<script src="../site_libs/quarto-search/fuse.min.js"></script>
+<script src="../site_libs/quarto-search/quarto-search.js"></script>
+<meta name="quarto:offset" content="../">
+<link href="../logistic_regression_1/logistic_reg_1.html" rel="prev">
+<link href="../data100_logo.png" rel="icon" type="image/png">
+<script src="../site_libs/quarto-html/quarto.js"></script>
+<script src="../site_libs/quarto-html/popper.min.js"></script>
+<script src="../site_libs/quarto-html/tippy.umd.min.js"></script>
+<script src="../site_libs/quarto-html/anchor.min.js"></script>
+<link href="../site_libs/quarto-html/tippy.css" rel="stylesheet">
+<link href="../site_libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">
+<script src="../site_libs/bootstrap/bootstrap.min.js"></script>
+<link href="../site_libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
+<link href="../site_libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">
+<script id="quarto-search-options" type="application/json">{
+  "location": "sidebar",
+  "copy-button": false,
+  "collapse-after": 3,
+  "panel-placement": "start",
+  "type": "textbox",
+  "limit": 20,
+  "language": {
+    "search-no-results-text": "No results",
+    "search-matching-documents-text": "matching documents",
+    "search-copy-link-title": "Copy link to search",
+    "search-hide-matches-text": "Hide additional matches",
+    "search-more-match-text": "more match in this document",
+    "search-more-matches-text": "more matches in this document",
+    "search-clear-button-title": "Clear",
+    "search-detached-cancel-button-title": "Cancel",
+    "search-submit-button-title": "Submit",
+    "search-label": "Search"
+  }
+}</script>
+
+  <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
+  <script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml-full.js" type="text/javascript"></script>
+
+</head>
+
+<body class="nav-sidebar floating">
+
+<div id="quarto-search-results"></div>
+  <header id="quarto-header" class="headroom fixed-top">
+  <nav class="quarto-secondary-nav">
+    <div class="container-fluid d-flex">
+      <button type="button" class="quarto-btn-toggle btn" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar,#quarto-sidebar-glass" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
+        <i class="bi bi-layout-text-sidebar-reverse"></i>
+      </button>
+      <nav class="quarto-page-breadcrumbs" aria-label="breadcrumb"><ol class="breadcrumb"><li class="breadcrumb-item"><a href="../logistic_regression_2/logistic_reg_2.html"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></a></li></ol></nav>
+      <a class="flex-grow-1" role="button" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar,#quarto-sidebar-glass" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">      
+      </a>
+      <button type="button" class="btn quarto-search-button" aria-label="Search" onclick="window.quartoOpenSearch();">
+        <i class="bi bi-search"></i>
+      </button>
+    </div>
+  </nav>
+</header>
+<!-- content -->
+<div id="quarto-content" class="quarto-container page-columns page-rows-contents page-layout-article">
+<!-- sidebar -->
+  <nav id="quarto-sidebar" class="sidebar collapse collapse-horizontal sidebar-navigation floating overflow-auto">
+    <div class="pt-lg-2 mt-2 text-left sidebar-header sidebar-header-stacked">
+      <a href="../index.html" class="sidebar-logo-link">
+      <img src="../data100_logo.png" alt="" class="sidebar-logo py-0 d-lg-inline d-none">
+      </a>
+    <div class="sidebar-title mb-0 py-0">
+      <a href="../">Principles and Techniques of Data Science</a> 
+        <div class="sidebar-tools-main">
+    <a href="https://github.com/DS-100/course-notes" rel="" title="Source Code" class="quarto-navigation-tool px-1" aria-label="Source Code"><i class="bi bi-github"></i></a>
+    <a href="../Principles-and-Techniques-of-Data-Science.pdf" rel="" title="Download PDF" class="quarto-navigation-tool px-1" aria-label="Download PDF"><i class="bi bi-file-pdf"></i></a>
+</div>
+    </div>
+      </div>
+        <div class="mt-2 flex-shrink-0 align-items-center">
+        <div class="sidebar-search">
+        <div id="quarto-search" class="" title="Search"></div>
+        </div>
+        </div>
+    <div class="sidebar-menu-container"> 
+    <ul class="list-unstyled mt-1">
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../index.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Welcome</span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../intro_lec/introduction.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">1</span>&nbsp; <span class="chapter-title">Introduction</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../pandas_1/pandas_1.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">2</span>&nbsp; <span class="chapter-title">Pandas I</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../pandas_2/pandas_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">3</span>&nbsp; <span class="chapter-title">Pandas II</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../pandas_3/pandas_3.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">4</span>&nbsp; <span class="chapter-title">Pandas III</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../eda/eda.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">5</span>&nbsp; <span class="chapter-title">Data Cleaning and EDA</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../regex/regex.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">6</span>&nbsp; <span class="chapter-title">Regular Expressions</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../visualization_1/visualization_1.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">7</span>&nbsp; <span class="chapter-title">Visualization I</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../visualization_2/visualization_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">8</span>&nbsp; <span class="chapter-title">Visualization II</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../sampling/sampling.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">9</span>&nbsp; <span class="chapter-title">Sampling</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../intro_to_modeling/intro_to_modeling.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">10</span>&nbsp; <span class="chapter-title">Introduction to Modeling</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../constant_model_loss_transformations/loss_transformations.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">11</span>&nbsp; <span class="chapter-title">Constant Model, Loss, and Transformations</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../ols/ols.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">12</span>&nbsp; <span class="chapter-title">Ordinary Least Squares</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../gradient_descent/gradient_descent.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">13</span>&nbsp; <span class="chapter-title">Gradient Descent</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../feature_engineering/feature_engineering.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">14</span>&nbsp; <span class="chapter-title">Sklearn and Feature Engineering</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../case_study_HCE/case_study_HCE.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">15</span>&nbsp; <span class="chapter-title">Case Study in Human Contexts and Ethics</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../cv_regularization/cv_reg.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">16</span>&nbsp; <span class="chapter-title">Cross Validation and Regularization</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../probability_1/probability_1.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">17</span>&nbsp; <span class="chapter-title">Random Variables</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../probability_2/probability_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">18</span>&nbsp; <span class="chapter-title">Estimators, Bias, and Variance</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../inference_causality/inference_causality.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">19</span>&nbsp; <span class="chapter-title">Bias, Variance, and Inference</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../sql_I/sql_I.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">20</span>&nbsp; <span class="chapter-title">SQL I</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../sql_II/sql_II.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">21</span>&nbsp; <span class="chapter-title">SQL II</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
+  </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link active">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
+</li>
+    </ul>
+    </div>
+</nav>
+<div id="quarto-sidebar-glass" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar,#quarto-sidebar-glass"></div>
+<!-- margin-sidebar -->
+    <div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
+        <nav id="TOC" role="doc-toc" class="toc-active">
+    <h2 id="toc-title">Table of contents</h2>
+   
+  <ul>
+  <li><a href="#decision-boundaries" id="toc-decision-boundaries" class="nav-link active" data-scroll-target="#decision-boundaries"><span class="header-section-number">23.1</span> Decision Boundaries</a></li>
+  <li><a href="#linear-separability-and-regularization" id="toc-linear-separability-and-regularization" class="nav-link" data-scroll-target="#linear-separability-and-regularization"><span class="header-section-number">23.2</span> Linear Separability and Regularization</a>
+  <ul>
+  <li><a href="#regularized-logistic-regression" id="toc-regularized-logistic-regression" class="nav-link" data-scroll-target="#regularized-logistic-regression"><span class="header-section-number">23.2.1</span> Regularized Logistic Regression</a></li>
+  </ul></li>
+  <li><a href="#performance-metrics" id="toc-performance-metrics" class="nav-link" data-scroll-target="#performance-metrics"><span class="header-section-number">23.3</span> Performance Metrics</a>
+  <ul>
+  <li><a href="#types-of-classification" id="toc-types-of-classification" class="nav-link" data-scroll-target="#types-of-classification"><span class="header-section-number">23.3.1</span> Types of Classification</a></li>
+  <li><a href="#accuracy-precision-and-recall" id="toc-accuracy-precision-and-recall" class="nav-link" data-scroll-target="#accuracy-precision-and-recall"><span class="header-section-number">23.3.2</span> Accuracy, Precision, and Recall</a></li>
+  <li><a href="#example-calculation" id="toc-example-calculation" class="nav-link" data-scroll-target="#example-calculation"><span class="header-section-number">23.3.3</span> Example Calculation</a>
+  <ul>
+  <li><a href="#model-1" id="toc-model-1" class="nav-link" data-scroll-target="#model-1"><span class="header-section-number">23.3.3.1</span> Model 1</a></li>
+  <li><a href="#model-2" id="toc-model-2" class="nav-link" data-scroll-target="#model-2"><span class="header-section-number">23.3.3.2</span> Model 2</a></li>
+  </ul></li>
+  <li><a href="#precision-vs.-recall" id="toc-precision-vs.-recall" class="nav-link" data-scroll-target="#precision-vs.-recall"><span class="header-section-number">23.3.4</span> Precision vs.&nbsp;Recall</a></li>
+  <li><a href="#two-more-metrics" id="toc-two-more-metrics" class="nav-link" data-scroll-target="#two-more-metrics"><span class="header-section-number">23.3.5</span> Two More Metrics</a></li>
+  </ul></li>
+  <li><a href="#adjusting-the-classification-threshold" id="toc-adjusting-the-classification-threshold" class="nav-link" data-scroll-target="#adjusting-the-classification-threshold"><span class="header-section-number">23.4</span> Adjusting the Classification Threshold</a>
+  <ul>
+  <li><a href="#precision-recall-curves" id="toc-precision-recall-curves" class="nav-link" data-scroll-target="#precision-recall-curves"><span class="header-section-number">23.4.1</span> Precision-Recall Curves</a></li>
+  <li><a href="#the-roc-curve" id="toc-the-roc-curve" class="nav-link" data-scroll-target="#the-roc-curve"><span class="header-section-number">23.4.2</span> The ROC Curve</a>
+  <ul>
+  <li><a href="#extra-what-is-the-worst-auc-and-why-is-it-0.5" id="toc-extra-what-is-the-worst-auc-and-why-is-it-0.5" class="nav-link" data-scroll-target="#extra-what-is-the-worst-auc-and-why-is-it-0.5"><span class="header-section-number">23.4.2.1</span> [Extra] What is the “worst” AUC, and why is it 0.5?</a></li>
+  </ul></li>
+  </ul></li>
+  <li><a href="#extra-gradient-descent-for-logistic-regression" id="toc-extra-gradient-descent-for-logistic-regression" class="nav-link" data-scroll-target="#extra-gradient-descent-for-logistic-regression"><span class="header-section-number">23.5</span> [Extra] Gradient Descent for Logistic Regression</a>
+  <ul>
+  <li><a href="#gradient-descent-update-rule" id="toc-gradient-descent-update-rule" class="nav-link" data-scroll-target="#gradient-descent-update-rule"><span class="header-section-number">23.5.1</span> Gradient Descent Update Rule</a></li>
+  <li><a href="#stochastic-gradient-descent-update-rule" id="toc-stochastic-gradient-descent-update-rule" class="nav-link" data-scroll-target="#stochastic-gradient-descent-update-rule"><span class="header-section-number">23.5.2</span> Stochastic Gradient Descent Update Rule</a></li>
+  </ul></li>
+  </ul>
+</nav>
+    </div>
+<!-- main -->
+<main class="content" id="quarto-document-content">
+
+<header id="title-block-header" class="quarto-title-block default">
+<div class="quarto-title">
+<h1 class="title"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></h1>
+</div>
+
+
+
+<div class="quarto-title-meta">
+
+    
+  
+    
+  </div>
+  
+
+</header>
+
+<div class="callout callout-style-default callout-note no-icon callout-titled">
+<div class="callout-header d-flex align-content-center" data-bs-toggle="collapse" data-bs-target=".callout-1-contents" aria-controls="callout-1" aria-expanded="true" aria-label="Toggle callout">
+<div class="callout-icon-container">
+<i class="callout-icon no-icon"></i>
+</div>
+<div class="callout-title-container flex-fill">
+Learning Outcomes
+</div>
+<div class="callout-btn-toggle d-inline-block border-0 py-1 ps-1 pe-0 float-end"><i class="callout-toggle"></i></div>
+</div>
+<div id="callout-1" class="callout-1-contents callout-collapse collapse show">
+<div class="callout-body-container callout-body">
+<ul>
+<li>Apply decision rules to make a classification</li>
+<li>Learn when logistic regression works well and when it does not</li>
+<li>Introduce new metrics for model performance</li>
+</ul>
+</div>
+</div>
+</div>
+<p>Today, we will continue studying the Logistic Regression model. We’ll discuss decision boundaries that help inform the classification of a particular prediction. Then, we’ll pick up from last lecture’s discussion of cross-entropy loss, study a few of its pitfalls, and learn potential remedies. We will also provide an implementation of <code>sklearn</code>’s logistic regression model. Lastly, we’ll return to decision rules and discuss metrics that allow us to determine our model’s performance in different scenarios.</p>
+<p>This will introduce us to the process of <strong>thresholding</strong> – a technique used to <em>classify</em> data from our model’s predicted probabilities, or <span class="math inline">\(P(Y=1|x)\)</span>. In doing so, we’ll focus on how these thresholding decisions affect the behavior of our model. We will learn various evaluation metrics useful for binary classification, and apply them to our study of logistic regression.</p>
+<center>
+<img src="images/log_reg_summary.png" alt="log_reg_summary" width="800">
+</center>
+<section id="decision-boundaries" class="level2" data-number="23.1">
+<h2 data-number="23.1" class="anchored" data-anchor-id="decision-boundaries"><span class="header-section-number">23.1</span> Decision Boundaries</h2>
+<p>In logistic regression, we model the <em>probability</em> that a datapoint belongs to Class 1. Last week, we developed the logistic regression model to predict that probability, but we never actually made any <em>classifications</em> for whether our prediction <span class="math inline">\(y\)</span> belongs in Class 0 or Class 1.</p>
+<p><span class="math display">\[ p = P(Y=1 | x) = \frac{1}{1 + e^{-x^T\theta}}\]</span></p>
+<p>A <strong>decision rule</strong> tells us how to interpret the output of the model to make a decision on how to classify a datapoint. We commonly make decision rules by specifying a <strong>threshold</strong>, <span class="math inline">\(T\)</span>. If the predicted probability is greater than or equal to <span class="math inline">\(T\)</span>, predict Class 1. Otherwise, predict Class 0.</p>
+<p><span class="math display">\[\hat y = \text{classify}(x) = \begin{cases}
+        1, &amp; P(Y=1|x) \ge T\\
+        0, &amp; \text{otherwise }
+    \end{cases}\]</span></p>
+<p>The threshold is often set to <span class="math inline">\(T = 0.5\)</span>, but <em>not always</em>. We’ll discuss why we might want to use other thresholds <span class="math inline">\(T \neq 0.5\)</span> later in this lecture.</p>
+<p>Using our decision rule, we can define a <strong>decision boundary</strong> as the “line” that splits the data into classes based on its features. For logistic regression, the decision boundary is a <strong>hyperplane</strong> – a linear combination of the features in p-dimensions – and we can recover it from the final logistic regression model. For example, if we have a model with 2 features (2D), we have <span class="math inline">\(\theta = [\theta_0, \theta_1, \theta_2]\)</span> including the intercept term, and we can solve for the decision boundary like so:</p>
+<p><span class="math display">\[
+\begin{align}
+T &amp;= \frac{1}{1 + e^{-(\theta_0 + \theta_1 \cdot \text{feature1} +  \theta_2 \cdot \text{feature2})}} \\
+1 + e^{-(\theta_0 + \theta_1 \cdot \text{feature1} +  \theta_2  \cdot  \text{feature2})} &amp;= \frac{1}{T} \\
+e^{-(\theta_0 + \theta_1  \cdot  \text{feature1} +  \theta_2  \cdot  \text{feature2})} &amp;= \frac{1}{T} - 1 \\
+\theta_0 + \theta_1  \cdot  \text{feature1} +  \theta_2  \cdot  \text{feature2} &amp;= -\log(\frac{1}{T} - 1)
+\end{align}
+\]</span></p>
+<p>For a model with 2 features, the decision boundary is a line in terms of its features. To make it easier to visualize, we’ve included an example of a 1-dimensional and a 2-dimensional decision boundary below. Notice how the decision boundary predicted by our logistic regression model perfectly separates the points into two classes.</p>
+<center>
+<img src="images/decision_boundary.png" alt="decision_boundary" width="800">
+</center>
+<p>In real life, however, that is often not the case, and we often see some overlap between points of different classes across the decision boundary. The <em>true</em> classes of the 2D data are shown below:</p>
+<center>
+<img src="images/decision_boundary_true.png" alt="decision_boundary_true" width="400">
+</center>
+<p>As you can see, the decision boundary predicted by our logistic regression does not perfectly separate the two classes. There’s a “muddled” region near the decision boundary where our classifier predicts the wrong class. What would the data have to look like for the classifier to make perfect predictions?</p>
+</section>
+<section id="linear-separability-and-regularization" class="level2" data-number="23.2">
+<h2 data-number="23.2" class="anchored" data-anchor-id="linear-separability-and-regularization"><span class="header-section-number">23.2</span> Linear Separability and Regularization</h2>
+<p>A classification dataset is said to be <strong>linearly separable</strong> if there exists a hyperplane among input features <span class="math inline">\(x\)</span> that separates the two classes <span class="math inline">\(y\)</span>.</p>
+<p>Linear separability in 1D can be found with a rugplot of a single feature. For example, notice how the plot on the bottom left is linearly separable along the vertical line <span class="math inline">\(x=0\)</span>. However, no such line perfectly separates the two classes on the bottom right.</p>
+<center>
+<img src="images/linear_separability_1D.png" alt="linear_separability_1D" width="800">
+</center>
+<p>This same definition holds in higher dimensions. If there are two features, the separating hyperplane must exist in two dimensions (any line of the form <span class="math inline">\(y=mx+b\)</span>). We can visualize this using a scatter plot.</p>
+<center>
+<img src="images/linear_separability_2D.png" alt="linear_separability_2D" width="800">
+</center>
+<p>This sounds great! When the dataset is linearly separable, a logistic regression classifier can perfectly assign datapoints into classes. However, (unexpected) complications may arise. Consider the <code>toy</code> dataset with 2 points and only a single feature <span class="math inline">\(x\)</span>:</p>
+<center>
+<img src="images/toy_2_point.png" alt="toy_linear_separability" width="500">
+</center>
+<p>The optimal <span class="math inline">\(\theta\)</span> value that minimizes loss pushes the predicted probabilities of the data points to their true class.</p>
+<ul>
+<li><span class="math inline">\(P(Y = 1|x = -1) = \frac{1}{1 + e^\theta} \rightarrow 1\)</span></li>
+<li><span class="math inline">\(P(Y = 1|x = 1) = \frac{1}{1 + e^{-\theta}} \rightarrow 0\)</span></li>
+</ul>
+<p>This happens when <span class="math inline">\(\theta = -\infty\)</span>. When <span class="math inline">\(\theta = -\infty\)</span>, we observe the following behavior for any input <span class="math inline">\(x\)</span>.</p>
+<p><span class="math display">\[P(Y=1|x) = \sigma(\theta x) \rightarrow \begin{cases}
+        1, \text{if }  x &lt; 0\\
+        0, \text{if }  x \ge 0
+    \end{cases}\]</span></p>
+<p>The diverging weights cause the model to be overconfident. For example, consider the new point <span class="math inline">\((x, y) = (0.5, 1)\)</span>. Following the behavior above, our model will incorrectly predict <span class="math inline">\(p=0\)</span>, and thus, <span class="math inline">\(\hat y = 0\)</span>.</p>
+<center>
+<img src="images/toy_3_point.png" alt="toy_linear_separability" width="500">
+</center>
+<p>The loss incurred by this misclassified point is infinite.</p>
+<p><span class="math display">\[-(y\text{ log}(p) + (1-y)\text{ log}(1-p)) = -1 \cdot \text{log}(0) = \infty\]</span></p>
+<p>Thus, diverging weights (<span class="math inline">\(|\theta| \rightarrow \infty\)</span>) occur with <strong>linearly separable</strong> data. “Overconfidence” is a particularly dangerous version of overfitting.</p>
+<p>Consider the loss function with respect to the parameter <span class="math inline">\(\theta\)</span>.</p>
+<center>
+<img src="images/unreg_loss.png" alt="unreg_loss" width="500">
+</center>
+<p>Though it’s very difficult to see, the plateau for negative values of <span class="math inline">\(\theta\)</span> is slightly tilted downwards, meaning the loss approaches <span class="math inline">\(0\)</span> as <span class="math inline">\(\theta\)</span> decreases and approaches <span class="math inline">\(-\infty\)</span>.</p>
+<section id="regularized-logistic-regression" class="level3" data-number="23.2.1">
+<h3 data-number="23.2.1" class="anchored" data-anchor-id="regularized-logistic-regression"><span class="header-section-number">23.2.1</span> Regularized Logistic Regression</h3>
+<p>To avoid large weights and infinite loss (particularly on linearly separable data), we use regularization. The same principles apply as with linear regression - make sure to standardize your features first.</p>
+<p>For example, <span class="math inline">\(L2\)</span> (Ridge) Logistic Regression takes on the form:</p>
+<p><span class="math display">\[\min_{\theta} -\frac{1}{n} \sum_{i=1}^{n} (y_i \text{log}(\sigma(x_i^T\theta)) + (1-y_i)\text{log}(1-\sigma(x_i^T\theta))) + \lambda \sum_{j=1}^{d} \theta_j^2\]</span></p>
+<p>Now, let us compare the loss functions of un-regularized and regularized logistic regression.</p>
+<center>
+<img src="images/unreg_loss.png" alt="unreg_loss" width="500">
+</center>
+<center>
+<img src="images/reg_loss.png" alt="reg_loss" width="500">
+</center>
+<p>As we can see, <span class="math inline">\(L2\)</span> regularization helps us prevent diverging weights and deters against “overconfidence.”</p>
+<p><code>sklearn</code>’s logistic regression defaults to L2 regularization and <code>C=1.0</code>; <code>C</code> is the inverse of <span class="math inline">\(\lambda\)</span>: <span class="math inline">\(C = \frac{1}{\lambda}\)</span>. Setting <code>C</code> to a large value, for example, <code>C=300.0</code>, results in minimal regularization.</p>
+<pre><code># sklearn defaults
+model = LogisticRegression(penalty='l2', C=1.0, …)
+model.fit()</code></pre>
+<p>Note that in Data 100, we only use <code>sklearn</code> to fit logistic regression models. There is no closed-form solution to the optimal theta vector, and the gradient is a little messy (see the bonus section below for details).</p>
+<p>From here, the <code>.predict</code> function returns the predicted class <span class="math inline">\(\hat y\)</span> of the point. In the simple binary case,</p>
+<p><span class="math display">\[\hat y = \begin{cases}
+        1, &amp; P(Y=1|x) \ge 0.5\\
+        0, &amp; \text{otherwise }
+    \end{cases}\]</span></p>
+</section>
+</section>
+<section id="performance-metrics" class="level2" data-number="23.3">
+<h2 data-number="23.3" class="anchored" data-anchor-id="performance-metrics"><span class="header-section-number">23.3</span> Performance Metrics</h2>
+<p>You might be thinking, if we’ve already introduced cross-entropy loss, why do we need additional ways of assessing how well our models perform? In linear regression, we made numerical predictions and used a loss function to determine how “good” these predictions were. In logistic regression, our ultimate goal is to classify data – we are much more concerned with whether or not each datapoint was assigned the correct class using the decision rule. As such, we are interested in the <em>quality</em> of classifications, not the predicted probabilities.</p>
+<p>The most basic evaluation metric is <strong>accuracy</strong>, that is, the proportion of correctly classified points.</p>
+<p><span class="math display">\[\text{accuracy} = \frac{\# \text{ of points classified correctly}}{\# \text{ of total points}}\]</span></p>
+<p>Translated to code:</p>
+<pre><code>def accuracy(X, Y):
+    return np.mean(model.predict(X) == Y)
+    
+model.score(X, y) # built-in accuracy function</code></pre>
+<p>However, accuracy is not always a great metric for classification. To understand why, let’s consider a classification problem with 100 emails where only 5 are truly spam, and the remaining 95 are truly ham. We’ll investigate two models where accuracy is a poor metric.</p>
+<ul>
+<li><strong>Model 1</strong>: Our first model classifies every email as non-spam. The model’s accuracy is high (<span class="math inline">\(\frac{95}{100} = 0.95\)</span>), but it doesn’t detect any spam emails. Despite the high accuracy, this is a bad model.</li>
+<li><strong>Model 2</strong>: The second model classifies every email as spam. The accuracy is low (<span class="math inline">\(\frac{5}{100} = 0.05\)</span>), but the model correctly labels every spam email. Unfortunately, it also misclassifies every non-spam email.</li>
+</ul>
+<p>As this example illustrates, accuracy is not always a good metric for classification, particularly when your data could exhibit class imbalance (e.g., very few 1’s compared to 0’s).</p>
+<section id="types-of-classification" class="level3" data-number="23.3.1">
+<h3 data-number="23.3.1" class="anchored" data-anchor-id="types-of-classification"><span class="header-section-number">23.3.1</span> Types of Classification</h3>
+<p>There are 4 different classifications that our model might make:</p>
+<ol type="1">
+<li><strong>True positive</strong>: correctly classify a positive point as being positive (<span class="math inline">\(y=1\)</span> and <span class="math inline">\(\hat{y}=1\)</span>)</li>
+<li><strong>True negative</strong>: correctly classify a negative point as being negative (<span class="math inline">\(y=0\)</span> and <span class="math inline">\(\hat{y}=0\)</span>)</li>
+<li><strong>False positive</strong>: incorrectly classify a negative point as being positive (<span class="math inline">\(y=0\)</span> and <span class="math inline">\(\hat{y}=1\)</span>)</li>
+<li><strong>False negative</strong>: incorrectly classify a positive point as being negative (<span class="math inline">\(y=1\)</span> and <span class="math inline">\(\hat{y}=0\)</span>)</li>
+</ol>
+<p>These classifications can be concisely summarized in a <strong>confusion matrix</strong>.</p>
+<center>
+<img src="images/confusion_matrix.png" alt="confusion_matrix" width="500">
+</center>
+<p>An easy way to remember this terminology is as follows:</p>
+<ol type="1">
+<li>Look at the second word in the phrase. <em>Positive</em> means a prediction of 1. <em>Negative</em> means a prediction of 0.</li>
+<li>Look at the first word in the phrase. <em>True</em> means our prediction was correct. <em>False</em> means it was incorrect.</li>
+</ol>
+<p>We can now write the accuracy calculation as <span class="math display">\[\text{accuracy} = \frac{TP + TN}{n}\]</span></p>
+<p>In <code>sklearn</code>, we use the following syntax</p>
+<pre><code>from sklearn.metrics import confusion_matrix
+cm = confusion_matrix(Y_true, Y_pred)</code></pre>
+<center>
+<img src="images/confusion_matrix_sklearn.png" alt="confusion_matrix" width="300">
+</center>
+</section>
+<section id="accuracy-precision-and-recall" class="level3" data-number="23.3.2">
+<h3 data-number="23.3.2" class="anchored" data-anchor-id="accuracy-precision-and-recall"><span class="header-section-number">23.3.2</span> Accuracy, Precision, and Recall</h3>
+<p>The purpose of our discussion of the confusion matrix was to motivate better performance metrics for classification problems with class imbalance - namely, precision and recall.</p>
+<p><strong>Precision</strong> is defined as</p>
+<p><span class="math display">\[\text{precision} = \frac{\text{TP}}{\text{TP + FP}}\]</span></p>
+<p>Precision answers the question: “Of all observations that were predicted to be <span class="math inline">\(1\)</span>, what proportion was actually <span class="math inline">\(1\)</span>?” It measures how accurate the classifier is when its predictions are positive.</p>
+<p><strong>Recall</strong> (or <strong>sensitivity</strong>) is defined as</p>
+<p><span class="math display">\[\text{recall} = \frac{\text{TP}}{\text{TP + FN}}\]</span></p>
+<p>Recall aims to answer: “Of all observations that were actually <span class="math inline">\(1\)</span>, what proportion was predicted to be <span class="math inline">\(1\)</span>?” It measures how many of the truly positive observations the classifier caught; a low recall means many positive cases were missed.</p>
+<p>Here’s a helpful graphic that summarizes our discussion above.</p>
+<center>
+<img src="images/precision_recall_graphic.png" alt="precision_recall_graphic" width="700">
+</center>
+</section>
+<section id="example-calculation" class="level3" data-number="23.3.3">
+<h3 data-number="23.3.3" class="anchored" data-anchor-id="example-calculation"><span class="header-section-number">23.3.3</span> Example Calculation</h3>
+<p>In this section, we will calculate the accuracy, precision, and recall performance metrics for our earlier spam classification example. As a reminder, we had 100 emails, 5 of which were spam. We designed two models:</p>
+<ul>
+<li>Model 1: Predict that every email is <em>non-spam</em></li>
+<li>Model 2: Predict that every email is <em>spam</em></li>
+</ul>
+<section id="model-1" class="level4" data-number="23.3.3.1">
+<h4 data-number="23.3.3.1" class="anchored" data-anchor-id="model-1"><span class="header-section-number">23.3.3.1</span> Model 1</h4>
+<p>First, let’s begin by creating the confusion matrix.</p>
+<table class="table">
+<colgroup>
+<col style="width: 27%">
+<col style="width: 27%">
+<col style="width: 38%">
+</colgroup>
+<thead>
+<tr class="header">
+<th></th>
+<th>0</th>
+<th>1</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td>0</td>
+<td>True Negative: 95</td>
+<td>False Positive: 0</td>
+</tr>
+<tr class="even">
+<td>1</td>
+<td>False Negative: 5</td>
+<td>True Positive: 0</td>
+</tr>
+</tbody>
+</table>
+<p>Convince yourself of why our confusion matrix looks like so.</p>
+<p><span class="math display">\[\text{accuracy} = \frac{95}{100} = 0.95\]</span> <span class="math display">\[\text{precision} = \frac{0}{0 + 0} = \text{undefined}\]</span> <span class="math display">\[\text{recall} = \frac{0}{0 + 5} = 0\]</span></p>
+<p>Notice how our precision is undefined because we never predicted class <span class="math inline">\(1\)</span>. Our recall is 0 for the same reason – the numerator is 0 (we had no positive predictions).</p>
+</section>
+<section id="model-2" class="level4" data-number="23.3.3.2">
+<h4 data-number="23.3.3.2" class="anchored" data-anchor-id="model-2"><span class="header-section-number">23.3.3.2</span> Model 2</h4>
+<p>Our confusion matrix for Model 2 looks like so.</p>
+<table class="table">
+<colgroup>
+<col style="width: 27%">
+<col style="width: 27%">
+<col style="width: 38%">
+</colgroup>
+<thead>
+<tr class="header">
+<th></th>
+<th>0</th>
+<th>1</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td>0</td>
+<td>True Negative: 0</td>
+<td>False Positive: 95</td>
+</tr>
+<tr class="even">
+<td>1</td>
+<td>False Negative: 0</td>
+<td>True Positive: 5</td>
+</tr>
+</tbody>
+</table>
+<p><span class="math display">\[\text{accuracy} = \frac{5}{100} = 0.05\]</span> <span class="math display">\[\text{precision} = \frac{5}{5 + 95} = 0.05\]</span> <span class="math display">\[\text{recall} = \frac{5}{5 + 0} = 1\]</span></p>
+<p>Our precision is low because we have many false positives, and our recall is perfect - we correctly classified all spam emails (we never predicted class <span class="math inline">\(0\)</span>).</p>
+</section>
+</section>
+<section id="precision-vs.-recall" class="level3" data-number="23.3.4">
+<h3 data-number="23.3.4" class="anchored" data-anchor-id="precision-vs.-recall"><span class="header-section-number">23.3.4</span> Precision vs.&nbsp;Recall</h3>
+<p>Precision (<span class="math inline">\(\frac{\text{TP}}{\text{TP} + \textbf{ FP}}\)</span>) penalizes false positives, while recall (<span class="math inline">\(\frac{\text{TP}}{\text{TP} + \textbf{ FN}}\)</span>) penalizes false negatives.</p>
+<p>In fact, precision and recall are <em>inversely related</em>. This is evident in our second model – we observed a high recall and low precision. Usually, there is a tradeoff in these two (most models can either minimize the number of FP or FN; and in rare cases, both).</p>
+<p>The specific performance metric(s) to prioritize depends on the context. In many medical settings, there might be a much higher cost to missing positive cases. For instance, in our breast cancer example, it is more costly to misclassify malignant tumors (false negatives) than it is to incorrectly classify a benign tumor as malignant (false positives). In the case of the latter, pathologists can conduct further studies to verify malignant tumors. As such, we should minimize the number of false negatives. This is equivalent to maximizing recall.</p>
+</section>
+<section id="two-more-metrics" class="level3" data-number="23.3.5">
+<h3 data-number="23.3.5" class="anchored" data-anchor-id="two-more-metrics"><span class="header-section-number">23.3.5</span> Two More Metrics</h3>
+<p>The <strong>True Positive Rate (TPR)</strong> is defined as</p>
+<p><span class="math display">\[\text{true positive rate} = \frac{\text{TP}}{\text{TP + FN}}\]</span></p>
+<p>You’ll notice this is equivalent to <em>recall</em>. In the context of our spam email classifier, it answers the question: “What proportion of spam did I mark correctly?”. We’d like this to be close to <span class="math inline">\(1\)</span>.</p>
+<p>The <strong>False Positive Rate (FPR)</strong> is defined as</p>
+<p><span class="math display">\[\text{false positive rate} = \frac{\text{FP}}{\text{FP + TN}}\]</span></p>
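+<p>The FPR is equal to <span class="math inline">\(1 - \text{specificity}\)</span>, where <em>specificity</em> is the true negative rate <span class="math inline">\(\frac{\text{TN}}{\text{TN + FP}}\)</span>. The FPR answers the question: “What proportion of regular email did I mark as spam?”. We’d like this to be close to <span class="math inline">\(0\)</span>.</p>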
+<p>As we increase threshold <span class="math inline">\(T\)</span>, both TPR and FPR decrease. We’ve plotted this relationship below for some model on a <code>toy</code> dataset.</p>
+<center>
+<img src="images/tpr_fpr.png" alt="tpr_fpr" width="800">
+</center>
+</section>
+</section>
+<section id="adjusting-the-classification-threshold" class="level2" data-number="23.4">
+<h2 data-number="23.4" class="anchored" data-anchor-id="adjusting-the-classification-threshold"><span class="header-section-number">23.4</span> Adjusting the Classification Threshold</h2>
+<p>One way to minimize the number of FP vs.&nbsp;FN (equivalently, maximizing precision vs.&nbsp;recall) is by adjusting the classification threshold <span class="math inline">\(T\)</span>.</p>
+<p><span class="math display">\[\hat y = \begin{cases}
+        1, &amp; P(Y=1|x) \ge T\\
+        0, &amp; \text{otherwise }
+    \end{cases}\]</span></p>
+<p>The default threshold in <code>sklearn</code> is <span class="math inline">\(T = 0.5\)</span>. As we increase the threshold <span class="math inline">\(T\)</span>, we “raise the standard” of how confident our classifier needs to be to predict 1 (i.e., “positive”).</p>
+<center>
+<img src="images/varying_threshold.png" alt="varying_threshold" width="800">
+</center>
+<p>As you may notice, the choice of threshold <span class="math inline">\(T\)</span> impacts our classifier’s performance.</p>
+<ul>
+<li>High <span class="math inline">\(T\)</span>: Most predictions are <span class="math inline">\(0\)</span>.
+<ul>
+<li>Lots of false negatives</li>
+<li>Fewer false positives</li>
+</ul></li>
+<li>Low <span class="math inline">\(T\)</span>: Most predictions are <span class="math inline">\(1\)</span>.
+<ul>
+<li>Lots of false positives</li>
+<li>Fewer false negatives</li>
+</ul></li>
+</ul>
+<p>In fact, we can choose a threshold <span class="math inline">\(T\)</span> based on our desired number, or proportion, of false positives and false negatives. We can do so using a few different tools. We’ll touch on two of the most important ones in Data 100.</p>
+<ol type="1">
+<li>Precision-Recall Curve (PR Curve)</li>
+<li>“Receiver Operating Characteristic” Curve (ROC Curve)</li>
+</ol>
+<section id="precision-recall-curves" class="level3" data-number="23.4.1">
+<h3 data-number="23.4.1" class="anchored" data-anchor-id="precision-recall-curves"><span class="header-section-number">23.4.1</span> Precision-Recall Curves</h3>
+<p>A <strong>Precision-Recall Curve (PR Curve)</strong> is an alternative to the ROC curve that displays the relationship between precision and recall for various threshold values. It is constructed in a similar way as with the ROC curve.</p>
+<p>Let’s first consider how precision and recall change as a function of the threshold <span class="math inline">\(T\)</span>. We know this quite well from earlier – precision will generally increase, and recall will decrease.</p>
+<center>
+<img src="images/precision-recall-thresh.png" alt="precision-recall-thresh" width="750">
+</center>
+<p>Displayed below is the PR Curve for the same <code>toy</code> dataset. Notice how threshold values increase as we move to the left.</p>
+<center>
+<img src="images/pr_curve_thresholds.png" alt="pr_curve_thresholds" width="685">
+</center>
+<p>Once again, the perfect classifier will resemble the orange curve, this time, facing the opposite direction.</p>
+<center>
+<img src="images/pr_curve_perfect.png" alt="pr_curve_perfect" width="675">
+</center>
+<p>We want our PR curve to be as close to the “top right” of this graph as possible. Again, we use the AUC to determine “closeness”, with the perfect classifier exhibiting an AUC = 1 (and the worst with an AUC = 0.5).</p>
+</section>
+<section id="the-roc-curve" class="level3" data-number="23.4.2">
+<h3 data-number="23.4.2" class="anchored" data-anchor-id="the-roc-curve"><span class="header-section-number">23.4.2</span> The ROC Curve</h3>
+<p>The “Receiver Operating Characteristic” Curve (<strong>ROC Curve</strong>) plots the tradeoff between FPR and TPR. Notice how the far-left of the curve corresponds to higher threshold <span class="math inline">\(T\)</span> values.</p>
+<center>
+<img src="images/roc_curve.png" alt="roc_curve" width="700">
+</center>
+<p>The “perfect” classifier is the one that has a TPR of 1, and FPR of 0. This is achieved at the top-left of the plot below. More generally, its ROC curve resembles the curve in orange.</p>
+<center>
+<img src="images/roc_curve_perfect.png" alt="roc_curve_perfect" width="700">
+</center>
+<p>We want our model to be as close to this orange curve as possible. How do we quantify “closeness”?</p>
+<p>We can compute the <strong>area under curve (AUC)</strong> of the ROC curve. Notice how the perfect classifier has an AUC = 1. The closer our model’s AUC is to 1, the better it is.</p>
+<section id="extra-what-is-the-worst-auc-and-why-is-it-0.5" class="level4" data-number="23.4.2.1">
+<h4 data-number="23.4.2.1" class="anchored" data-anchor-id="extra-what-is-the-worst-auc-and-why-is-it-0.5"><span class="header-section-number">23.4.2.1</span> [Extra] What is the “worst” AUC, and why is it 0.5?</h4>
+<p>On the other hand, a terrible model will have an AUC closer to 0.5. Random predictors randomly predict <span class="math inline">\(P(Y = 1 | x)\)</span> to be uniformly between 0 and 1. This indicates the classifier is not able to distinguish between positive and negative classes, and thus, randomly predicts one of the two.</p>
+<center>
+<img src="images/roc_curve_worst_predictor.png" alt="roc_curve_worst_predictor" width="900">
+</center>
+</section>
+</section>
+</section>
+<section id="extra-gradient-descent-for-logistic-regression" class="level2" data-number="23.5">
+<h2 data-number="23.5" class="anchored" data-anchor-id="extra-gradient-descent-for-logistic-regression"><span class="header-section-number">23.5</span> [Extra] Gradient Descent for Logistic Regression</h2>
+<p>Let’s define the following: <span class="math display">\[
+t_i = \phi(x_i)^T \theta \\
+p_i = \sigma(t_i) \\
+t_i = \log(\frac{p_i}{1 - p_i}) \\
+1 - \sigma(t_i) = \sigma(-t_i) \\
+\frac{d}{dt}  \sigma(t) =  \sigma(t) \sigma(-t)
+\]</span></p>
+<p>Now, we can simplify the cross-entropy loss <span class="math display">\[
+\begin{align}
+y_i \log(p_i) + (1 - y_i) \log(1 - p_i) &amp;= y_i \log(\frac{p_i}{1 - p_i}) + \log(1 - p_i) \\
+&amp;= y_i \phi(x_i)^T \theta + \log(\sigma(-\phi(x_i)^T \theta))
+\end{align}
+\]</span></p>
+<p>Hence, the optimal <span class="math inline">\(\hat{\theta}\)</span> is <span class="math display">\[\text{argmin}_{\theta} - \frac{1}{n} \sum_{i=1}^n (y_i \phi(x_i)^T \theta + \log(\sigma(-\phi(x_i)^T \theta)))\]</span></p>
+<p>We want to minimize <span class="math display">\[L(\theta) = - \frac{1}{n} \sum_{i=1}^n (y_i \phi(x_i)^T + \log(\sigma(-\phi(x_i)^T \theta)))\]</span></p>
+<p>So we take the derivative <span class="math display">\[
+\begin{align}
+\triangledown_{\theta} L(\theta) &amp;= - \frac{1}{n} \sum_{i=1}^n \left( \triangledown_{\theta} (y_i \phi(x_i)^T \theta) + \triangledown_{\theta} \log(\sigma(-\phi(x_i)^T \theta)) \right) \\
+&amp;= - \frac{1}{n} \sum_{i=1}^n \left( y_i \phi(x_i) + \triangledown_{\theta} \log(\sigma(-\phi(x_i)^T \theta)) \right) \\
+&amp;= - \frac{1}{n} \sum_{i=1}^n \left( y_i \phi(x_i) + \frac{1}{\sigma(-\phi(x_i)^T \theta)} \triangledown_{\theta} \sigma(-\phi(x_i)^T \theta) \right) \\
+&amp;= - \frac{1}{n} \sum_{i=1}^n \left( y_i \phi(x_i) - \frac{\sigma(-\phi(x_i)^T \theta)}{\sigma(-\phi(x_i)^T \theta)} \sigma(\phi(x_i)^T \theta) \phi(x_i) \right) \\
+&amp;= - \frac{1}{n} \sum_{i=1}^n (y_i - \sigma(\phi(x_i)^T \theta)) \phi(x_i)
+\end{align}
+\]</span></p>
+<p>Setting the derivative equal to 0 and solving for <span class="math inline">\(\hat{\theta}\)</span>, we find that there’s no general analytic solution. Therefore, we must solve using numeric methods.</p>
+<section id="gradient-descent-update-rule" class="level3" data-number="23.5.1">
+<h3 data-number="23.5.1" class="anchored" data-anchor-id="gradient-descent-update-rule"><span class="header-section-number">23.5.1</span> Gradient Descent Update Rule</h3>
+<p><span class="math display">\[\theta^{(0)} \leftarrow \text{initial vector (random, zeros, ...)} \]</span></p>
+<p>For <span class="math inline">\(\tau\)</span> from 0 to convergence: <span class="math display">\[ \theta^{(\tau + 1)} \leftarrow \theta^{(\tau)} - \rho(\tau)\left( \frac{1}{n} \sum_{i=1}^n \triangledown_{\theta} L_i(\theta) \mid_{\theta = \theta^{(\tau)}}\right) \]</span></p>
+</section>
+<section id="stochastic-gradient-descent-update-rule" class="level3" data-number="23.5.2">
+<h3 data-number="23.5.2" class="anchored" data-anchor-id="stochastic-gradient-descent-update-rule"><span class="header-section-number">23.5.2</span> Stochastic Gradient Descent Update Rule</h3>
+<p><span class="math display">\[\theta^{(0)} \leftarrow \text{initial vector (random, zeros, ...)} \]</span></p>
+<p>For <span class="math inline">\(\tau\)</span> from 0 to convergence, let <span class="math inline">\(B\)</span> ~ <span class="math inline">\(\text{Random subset of indices}\)</span>. <span class="math display">\[ \theta^{(\tau + 1)} \leftarrow \theta^{(\tau)} - \rho(\tau)\left( \frac{1}{|B|} \sum_{i \in B} \triangledown_{\theta} L_i(\theta) \mid_{\theta = \theta^{(\tau)}}\right) \]</span></p>
+
+
+</section>
+</section>
+
+</main> <!-- /main -->
+<script id="quarto-html-after-body" type="application/javascript">
+window.document.addEventListener("DOMContentLoaded", function (event) {
+  const toggleBodyColorMode = (bsSheetEl) => {
+    const mode = bsSheetEl.getAttribute("data-mode");
+    const bodyEl = window.document.querySelector("body");
+    if (mode === "dark") {
+      bodyEl.classList.add("quarto-dark");
+      bodyEl.classList.remove("quarto-light");
+    } else {
+      bodyEl.classList.add("quarto-light");
+      bodyEl.classList.remove("quarto-dark");
+    }
+  }
+  const toggleBodyColorPrimary = () => {
+    const bsSheetEl = window.document.querySelector("link#quarto-bootstrap");
+    if (bsSheetEl) {
+      toggleBodyColorMode(bsSheetEl);
+    }
+  }
+  toggleBodyColorPrimary();  
+  const icon = "";
+  const anchorJS = new window.AnchorJS();
+  anchorJS.options = {
+    placement: 'right',
+    icon: icon
+  };
+  anchorJS.add('.anchored');
+  const isCodeAnnotation = (el) => {
+    for (const clz of el.classList) {
+      if (clz.startsWith('code-annotation-')) {                     
+        return true;
+      }
+    }
+    return false;
+  }
+  const clipboard = new window.ClipboardJS('.code-copy-button', {
+    text: function(trigger) {
+      const codeEl = trigger.previousElementSibling.cloneNode(true);
+      for (const childEl of codeEl.children) {
+        if (isCodeAnnotation(childEl)) {
+          childEl.remove();
+        }
+      }
+      return codeEl.innerText;
+    }
+  });
+  clipboard.on('success', function(e) {
+    // button target
+    const button = e.trigger;
+    // don't keep focus
+    button.blur();
+    // flash "checked"
+    button.classList.add('code-copy-button-checked');
+    var currentTitle = button.getAttribute("title");
+    button.setAttribute("title", "Copied!");
+    let tooltip;
+    if (window.bootstrap) {
+      button.setAttribute("data-bs-toggle", "tooltip");
+      button.setAttribute("data-bs-placement", "left");
+      button.setAttribute("data-bs-title", "Copied!");
+      tooltip = new bootstrap.Tooltip(button, 
+        { trigger: "manual", 
+          customClass: "code-copy-button-tooltip",
+          offset: [0, -8]});
+      tooltip.show();    
+    }
+    setTimeout(function() {
+      if (tooltip) {
+        tooltip.hide();
+        button.removeAttribute("data-bs-title");
+        button.removeAttribute("data-bs-toggle");
+        button.removeAttribute("data-bs-placement");
+      }
+      button.setAttribute("title", currentTitle);
+      button.classList.remove('code-copy-button-checked');
+    }, 1000);
+    // clear code selection
+    e.clearSelection();
+  });
+  function tippyHover(el, contentFn) {
+    const config = {
+      allowHTML: true,
+      content: contentFn,
+      maxWidth: 500,
+      delay: 100,
+      arrow: false,
+      appendTo: function(el) {
+          return el.parentElement;
+      },
+      interactive: true,
+      interactiveBorder: 10,
+      theme: 'quarto',
+      placement: 'bottom-start'
+    };
+    window.tippy(el, config); 
+  }
+  const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
+  for (var i=0; i<noterefs.length; i++) {
+    const ref = noterefs[i];
+    tippyHover(ref, function() {
+      // use id or data attribute instead here
+      let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
+      try { href = new URL(href).hash; } catch {}
+      const id = href.replace(/^#\/?/, "");
+      const note = window.document.getElementById(id);
+      return note.innerHTML;
+    });
+  }
+      let selectedAnnoteEl;
+      const selectorForAnnotation = ( cell, annotation) => {
+        let cellAttr = 'data-code-cell="' + cell + '"';
+        let lineAttr = 'data-code-annotation="' +  annotation + '"';
+        const selector = 'span[' + cellAttr + '][' + lineAttr + ']';
+        return selector;
+      }
+      const selectCodeLines = (annoteEl) => {
+        const doc = window.document;
+        const targetCell = annoteEl.getAttribute("data-target-cell");
+        const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
+        const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
+        const lines = annoteSpan.getAttribute("data-code-lines").split(",");
+        const lineIds = lines.map((line) => {
+          return targetCell + "-" + line;
+        })
+        let top = null;
+        let height = null;
+        let parent = null;
+        if (lineIds.length > 0) {
+            //compute the position of the single el (top and bottom and make a div)
+            const el = window.document.getElementById(lineIds[0]);
+            top = el.offsetTop;
+            height = el.offsetHeight;
+            parent = el.parentElement.parentElement;
+          if (lineIds.length > 1) {
+            const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
+            const bottom = lastEl.offsetTop + lastEl.offsetHeight;
+            height = bottom - top;
+          }
+          if (top !== null && height !== null && parent !== null) {
+            // cook up a div (if necessary) and position it 
+            let div = window.document.getElementById("code-annotation-line-highlight");
+            if (div === null) {
+              div = window.document.createElement("div");
+              div.setAttribute("id", "code-annotation-line-highlight");
+              div.style.position = 'absolute';
+              parent.appendChild(div);
+            }
+            div.style.top = top - 2 + "px";
+            div.style.height = height + 4 + "px";
+            let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
+            if (gutterDiv === null) {
+              gutterDiv = window.document.createElement("div");
+              gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
+              gutterDiv.style.position = 'absolute';
+              const codeCell = window.document.getElementById(targetCell);
+              const gutter = codeCell.querySelector('.code-annotation-gutter');
+              gutter.appendChild(gutterDiv);
+            }
+            gutterDiv.style.top = top - 2 + "px";
+            gutterDiv.style.height = height + 4 + "px";
+          }
+          selectedAnnoteEl = annoteEl;
+        }
+      };
+      const unselectCodeLines = () => {
+        const elementsIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"];
+        elementsIds.forEach((elId) => {
+          const div = window.document.getElementById(elId);
+          if (div) {
+            div.remove();
+          }
+        });
+        selectedAnnoteEl = undefined;
+      };
+      // Attach click handler to the DT
+      const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
+      for (const annoteDlNode of annoteDls) {
+        annoteDlNode.addEventListener('click', (event) => {
+          const clickedEl = event.target;
+          if (clickedEl !== selectedAnnoteEl) {
+            unselectCodeLines();
+            const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
+            if (activeEl) {
+              activeEl.classList.remove('code-annotation-active');
+            }
+            selectCodeLines(clickedEl);
+            clickedEl.classList.add('code-annotation-active');
+          } else {
+            // Unselect the line
+            unselectCodeLines();
+            clickedEl.classList.remove('code-annotation-active');
+          }
+        });
+      }
+  const findCites = (el) => {
+    const parentEl = el.parentElement;
+    if (parentEl) {
+      const cites = parentEl.dataset.cites;
+      if (cites) {
+        return {
+          el,
+          cites: cites.split(' ')
+        };
+      } else {
+        return findCites(el.parentElement)
+      }
+    } else {
+      return undefined;
+    }
+  };
+  var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
+  for (var i=0; i<bibliorefs.length; i++) {
+    const ref = bibliorefs[i];
+    const citeInfo = findCites(ref);
+    if (citeInfo) {
+      tippyHover(citeInfo.el, function() {
+        var popup = window.document.createElement('div');
+        citeInfo.cites.forEach(function(cite) {
+          var citeDiv = window.document.createElement('div');
+          citeDiv.classList.add('hanging-indent');
+          citeDiv.classList.add('csl-entry');
+          var biblioDiv = window.document.getElementById('ref-' + cite);
+          if (biblioDiv) {
+            citeDiv.innerHTML = biblioDiv.innerHTML;
+          }
+          popup.appendChild(citeDiv);
+        });
+        return popup.innerHTML;
+      });
+    }
+  }
+});
+</script>
+<nav class="page-navigation">
+  <div class="nav-page nav-page-previous">
+      <a href="../logistic_regression_1/logistic_reg_1.html" class="pagination-link">
+        <i class="bi bi-arrow-left-short"></i> <span class="nav-page-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span>
+      </a>          
+  </div>
+  <div class="nav-page nav-page-next">
+  </div>
+</nav>
+</div> <!-- /content -->
+
+
+
+</body></html>
\ No newline at end of file
diff --git a/docs/ols/ols.html b/docs/ols/ols.html
index a7795be2..add8bb30 100644
--- a/docs/ols/ols.html
+++ b/docs/ols/ols.html
@@ -283,6 +283,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/pandas_1/pandas_1.html b/docs/pandas_1/pandas_1.html
index a6fe9c55..fe2deb17 100644
--- a/docs/pandas_1/pandas_1.html
+++ b/docs/pandas_1/pandas_1.html
@@ -283,6 +283,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/pandas_2/pandas_2.html b/docs/pandas_2/pandas_2.html
index 50846454..888ca981 100644
--- a/docs/pandas_2/pandas_2.html
+++ b/docs/pandas_2/pandas_2.html
@@ -281,6 +281,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/pandas_3/pandas_3.html b/docs/pandas_3/pandas_3.html
index 6389327e..f4dc7c56 100644
--- a/docs/pandas_3/pandas_3.html
+++ b/docs/pandas_3/pandas_3.html
@@ -297,6 +297,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/probability_1/probability_1.html b/docs/probability_1/probability_1.html
index 7ebe1188..1a975de7 100644
--- a/docs/probability_1/probability_1.html
+++ b/docs/probability_1/probability_1.html
@@ -280,6 +280,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/probability_2/probability_2.html b/docs/probability_2/probability_2.html
index 8e98b0ee..5fcce79e 100644
--- a/docs/probability_2/probability_2.html
+++ b/docs/probability_2/probability_2.html
@@ -280,6 +280,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/regex/regex.html b/docs/regex/regex.html
index e037998e..612e6172 100644
--- a/docs/regex/regex.html
+++ b/docs/regex/regex.html
@@ -281,6 +281,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/sampling/sampling.html b/docs/sampling/sampling.html
index b2152c82..aa9d1cf4 100644
--- a/docs/sampling/sampling.html
+++ b/docs/sampling/sampling.html
@@ -281,6 +281,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/sql_I/sql_I.html b/docs/sql_I/sql_I.html
index 4bdf471f..3878e087 100644
--- a/docs/sql_I/sql_I.html
+++ b/docs/sql_I/sql_I.html
@@ -281,6 +281,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/sql_II/sql_II.html b/docs/sql_II/sql_II.html
index 9de97581..380838f3 100644
--- a/docs/sql_II/sql_II.html
+++ b/docs/sql_II/sql_II.html
@@ -281,6 +281,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/visualization_1/visualization_1.html b/docs/visualization_1/visualization_1.html
index c8b2faad..64d7b9b6 100644
--- a/docs/visualization_1/visualization_1.html
+++ b/docs/visualization_1/visualization_1.html
@@ -283,6 +283,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/docs/visualization_2/visualization_2.html b/docs/visualization_2/visualization_2.html
index 702256a8..9b7c6c58 100644
--- a/docs/visualization_2/visualization_2.html
+++ b/docs/visualization_2/visualization_2.html
@@ -283,6 +283,12 @@
   <a href="../logistic_regression_1/logistic_reg_1.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">Logistic Regression I</span></span></a>
   </div>
+</li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../logistic_regression_2/logistic_reg_2.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">Logistic Regression II</span></span></a>
+  </div>
 </li>
     </ul>
     </div>
diff --git a/index.log b/index.log
index d5c62e27..b389f684 100644
--- a/index.log
+++ b/index.log
@@ -1,4 +1,4 @@
-This is XeTeX, Version 3.141592653-2.6-0.999995 (TeX Live 2023) (preloaded format=xelatex 2023.11.3)  9 NOV 2023 18:34
+This is XeTeX, Version 3.141592653-2.6-0.999995 (TeX Live 2023) (preloaded format=xelatex 2023.11.3)  10 NOV 2023 10:05
 entering extended mode
  restricted \write18 enabled.
  %&-line parsing enabled.
diff --git a/index.pdf b/index.pdf
index fb9ddd11..1813bd41 100644
Binary files a/index.pdf and b/index.pdf differ
diff --git a/index.tex b/index.tex
index e47464d4..32c3e0bb 100644
--- a/index.tex
+++ b/index.tex
@@ -220,7 +220,7 @@
 
 \begin{document}
 \maketitle
-\ifdefined\Shaded\renewenvironment{Shaded}{\begin{tcolorbox}[frame hidden, sharp corners, enhanced, boxrule=0pt, breakable, borderline west={3pt}{0pt}{shadecolor}, interior hidden]}{\end{tcolorbox}}\fi
+\ifdefined\Shaded\renewenvironment{Shaded}{\begin{tcolorbox}[sharp corners, enhanced, borderline west={3pt}{0pt}{shadecolor}, frame hidden, interior hidden, boxrule=0pt, breakable]}{\end{tcolorbox}}\fi
 
 \renewcommand*\contentsname{Table of contents}
 {
@@ -258,7 +258,7 @@ \section*{About the Course Notes}\label{about-the-course-notes}}
 \hypertarget{introduction}{%
 \chapter{Introduction}\label{introduction}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -304,7 +304,7 @@ \chapter{Introduction}\label{introduction}}
 allowing you to take data and produce useful insights on the world's
 most challenging and ambiguous problems.
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Course Goals}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Course Goals}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -320,7 +320,7 @@ \chapter{Introduction}\label{introduction}}
 
 \end{tcolorbox}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Some Topics We'll Cover}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Some Topics We'll Cover}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -567,7 +567,7 @@ \section{Conclusion}\label{conclusion}}
 \hypertarget{pandas-i}{%
 \chapter{Pandas I}\label{pandas-i}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -3398,7 +3398,7 @@ \section{Parting Note}\label{parting-note}}
 \hypertarget{pandas-ii}{%
 \chapter{Pandas II}\label{pandas-ii}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -45939,7 +45939,7 @@ \section{Parting Note}\label{parting-note-1}}
 \hypertarget{pandas-iii}{%
 \chapter{Pandas III}\label{pandas-iii}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -61137,7 +61137,7 @@ \chapter{Data Cleaning and EDA}\label{data-cleaning-and-eda}}
 \end{Highlighting}
 \end{Shaded}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -63674,7 +63674,7 @@ \section{EDA and Data Wrangling}\label{eda-and-data-wrangling}}
 \hypertarget{regular-expressions}{%
 \chapter{Regular Expressions}\label{regular-expressions}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -64965,7 +64965,7 @@ \section{Limitations of Regular
 \hypertarget{visualization-i}{%
 \chapter{Visualization I}\label{visualization-i}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -65784,7 +65784,7 @@ \subsection{Modes}\label{modes}}
 \hypertarget{visualization-ii}{%
 \chapter{Visualization II}\label{visualization-ii}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -67242,7 +67242,7 @@ \subsection{Harnessing Context}\label{harnessing-context}}
 \hypertarget{sampling}{%
 \chapter{Sampling}\label{sampling}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -67931,7 +67931,7 @@ \section{Summary}\label{summary-1}}
 \hypertarget{introduction-to-modeling}{%
 \chapter{Introduction to Modeling}\label{introduction-to-modeling}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -68294,7 +68294,7 @@ \subsection{Derivation}\label{derivation}}
   \(\hat{a} = \text{average of }y - \text{slope}\cdot\text{average of }x\)
 \end{itemize}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacityback=0, left=2mm, arc=.35mm, rightrule=.15mm, leftrule=.75mm, bottomrule=.15mm]
+\begin{tcolorbox}[enhanced jigsaw, opacityback=0, arc=.35mm, colback=white, rightrule=.15mm, toprule=.15mm, bottomrule=.15mm, leftrule=.75mm, left=2mm, breakable]
 
 Proof:
 
@@ -68646,7 +68646,7 @@ \section{Fitting the Model}\label{fitting-the-model}}
 \chapter{Constant Model, Loss, and
 Transformations}\label{constant-model-loss-and-transformations}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -69840,7 +69840,7 @@ \section{Transformations to fit Linear
 \hypertarget{ordinary-least-squares}{%
 \chapter{Ordinary Least Squares}\label{ordinary-least-squares}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -70470,7 +70470,7 @@ \section{OLS Properties}\label{ols-properties}}
 
 \[\mathbb{X}^Te = 0 \]
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacityback=0, left=2mm, arc=.35mm, rightrule=.15mm, leftrule=.75mm, bottomrule=.15mm]
+\begin{tcolorbox}[enhanced jigsaw, opacityback=0, arc=.35mm, colback=white, rightrule=.15mm, toprule=.15mm, bottomrule=.15mm, leftrule=.75mm, left=2mm, breakable]
 
 Proof:
 
@@ -70508,7 +70508,7 @@ \section{OLS Properties}\label{ols-properties}}
 
 \[\sum_i^n e_i = 0\]
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacityback=0, left=2mm, arc=.35mm, rightrule=.15mm, leftrule=.75mm, bottomrule=.15mm]
+\begin{tcolorbox}[enhanced jigsaw, opacityback=0, arc=.35mm, colback=white, rightrule=.15mm, toprule=.15mm, bottomrule=.15mm, leftrule=.75mm, left=2mm, breakable]
 
 Proof:
 
@@ -70537,7 +70537,7 @@ \section{OLS Properties}\label{ols-properties}}
   only if \(\mathbb{X}\) is \textbf{full column rank}.
 \end{enumerate}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacityback=0, left=2mm, arc=.35mm, rightrule=.15mm, leftrule=.75mm, bottomrule=.15mm]
+\begin{tcolorbox}[enhanced jigsaw, opacityback=0, arc=.35mm, colback=white, rightrule=.15mm, toprule=.15mm, bottomrule=.15mm, leftrule=.75mm, left=2mm, breakable]
 
 Proof:
 
@@ -70619,7 +70619,7 @@ \section{OLS Properties}\label{ols-properties}}
 \hypertarget{gradient-descent}{%
 \chapter{Gradient Descent}\label{gradient-descent}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -71351,7 +71351,7 @@ \section{Batch, Mini-Batch Gradient Descent and Stochastic Gradient
 \chapter{Sklearn and Feature
 Engineering}\label{sklearn-and-feature-engineering}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -72163,7 +72163,7 @@ \section{Complexity and Overfitting}\label{complexity-and-overfitting}}
 \chapter{Case Study in Human Contexts and
 Ethics}\label{case-study-in-human-contexts-and-ethics}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -72354,7 +72354,7 @@ \section{The Response: Cook County Open Data
 \subsection{Question/Problem
 Formulation}\label{questionproblem-formulation}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Driving Questions}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Driving Questions}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -72400,7 +72400,7 @@ \subsection{Question/Problem
   \end{itemize}
 \end{enumerate}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-tip-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-tip-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Definitions: Fairness and Transparency}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-tip-color-frame, colbacktitle=quarto-callout-tip-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Definitions: Fairness and Transparency}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 The definitions, as given by the Cook County Assessor's Office, are
 given below:
@@ -72451,7 +72451,7 @@ \subsection{Question/Problem
 \subsection{Data Acquisition and
 Cleaning}\label{data-acquisition-and-cleaning}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Driving Questions}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Driving Questions}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -72498,7 +72498,7 @@ \subsection{Data Acquisition and
 \hypertarget{exploratory-data-analysis}{%
 \subsection{Exploratory Data Analysis}\label{exploratory-data-analysis}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Driving Questions}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Driving Questions}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -72543,7 +72543,7 @@ \subsection{Exploratory Data Analysis}\label{exploratory-data-analysis}}
 \hypertarget{prediction-and-inference}{%
 \subsection{Prediction and Inference}\label{prediction-and-inference}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Driving Questions}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Driving Questions}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -72584,7 +72584,7 @@ \subsection{Prediction and Inference}\label{prediction-and-inference}}
 \subsection{Reports, Decisions, and
 Conclusions}\label{reports-decisions-and-conclusions}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Driving Questions}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Driving Questions}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -72724,7 +72724,7 @@ \section{Lessons for Data Science
 \chapter{Cross Validation and
 Regularization}\label{cross-validation-and-regularization}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -73478,7 +73478,7 @@ \section{Regression Summary}\label{regression-summary}}
 \hypertarget{random-variables}{%
 \chapter{Random Variables}\label{random-variables}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -73521,7 +73521,7 @@ \chapter{Random Variables}\label{random-variables}}
   perspective to investigate our choice of model complexity
 \end{enumerate}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-tip-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-tip-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Data 8 Recap}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-tip-color-frame, colbacktitle=quarto-callout-tip-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Data 8 Recap}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 Recall the following concepts from Data 8:
 
@@ -73763,7 +73763,7 @@ \subsection{Variance}\label{variance}}
 
 \[\text{Var}(X) = \mathbb{E}[X^2] - (\mathbb{E}[X])^2\]
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-tip-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-tip-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Proof}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-tip-color-frame, colbacktitle=quarto-callout-tip-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Proof}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \[\begin{align}
    \text{Var}(X) &= \mathbb{E}[(X-\mathbb{E}[X])^2] \\
@@ -73791,7 +73791,7 @@ \subsection{Example: Dice}\label{example-dice}}
       0, \text{otherwise} 
    \end{cases}\]
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-caution-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-caution-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-caution-color}{\faFire}\hspace{0.5em}{What's the expectation \(\mathbb{E}[X]?\)}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-caution-color-frame, colbacktitle=quarto-callout-caution-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-caution-color}{\faFire}\hspace{0.5em}{What's the expectation \(\mathbb{E}[X]?\)}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \[ \begin{align} 
          \mathbb{E}[X] &= 1(\frac{1}{6}) + 2(\frac{1}{6}) + 3(\frac{1}{6}) + 4(\frac{1}{6}) + 5(\frac{1}{6}) + 6(\frac{1}{6}) \\
@@ -73801,7 +73801,7 @@ \subsection{Example: Dice}\label{example-dice}}
 
 \end{tcolorbox}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-caution-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-caution-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-caution-color}{\faFire}\hspace{0.5em}{What's the variance \(\text{Var}(X)?\)}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-caution-color-frame, colbacktitle=quarto-callout-caution-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-caution-color}{\faFire}\hspace{0.5em}{What's the variance \(\text{Var}(X)?\)}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 Using approach 1: \[\begin{align} 
       \text{Var}(X) &= (\frac{1}{6})((1 - \frac{7}{2})^2 + (2 - \frac{7}{2})^2 + (3 - \frac{7}{2})^2 + (4 - \frac{7}{2})^2 + (5 - \frac{7}{2})^2 + (6 - \frac{7}{2})^2) \\
@@ -73900,7 +73900,7 @@ \subsection{Properties of Expectation}\label{properties-of-expectation}}
 
 \[\mathbb{E}[aX+b] = a\mathbb{E}[X] + b\]
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-tip-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-tip-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Proof}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-tip-color-frame, colbacktitle=quarto-callout-tip-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Proof}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \[\begin{align}
         \mathbb{E}[aX+b] &= \sum_{x} (ax + b) P(X=x) \\
@@ -73921,7 +73921,7 @@ \subsection{Properties of Expectation}\label{properties-of-expectation}}
 
 \[\mathbb{E}[X+Y] = \mathbb{E}[X] + \mathbb{E}[Y]\]
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-tip-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-tip-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Proof}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-tip-color-frame, colbacktitle=quarto-callout-tip-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Proof}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \[\begin{align}
     \mathbb{E}[X+Y] &= \sum_{s} (X+Y)(s) P(s) \\
@@ -73971,7 +73971,7 @@ \subsection{Properties of Variance}\label{properties-of-variance}}
   \(X\) by \(b\) units.
 \end{itemize}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-tip-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-tip-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Proof}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-tip-color-frame, colbacktitle=quarto-callout-tip-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Proof}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 We know that \[\mathbb{E}[aX+b] = a\mathbb{E}[X] + b\]
 
@@ -74009,7 +74009,7 @@ \subsection{Properties of Variance}\label{properties-of-variance}}
   \[\text{Var}(X + Y) = \text{Var}(X) + \text{Var}(Y) \qquad \text{if } X, Y \text{ independent}\]
 \end{enumerate}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-tip-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-tip-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Proof}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-tip-color-frame, colbacktitle=quarto-callout-tip-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Proof}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 The variance of a sum is affected by the dependence between the two
 random variables that are being added. Let's expand out the definition
@@ -74094,7 +74094,7 @@ \subsection{Summary}\label{summary-2}}
 \chapter{Estimators, Bias, and
 Variance}\label{estimators-bias-and-variance}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -74244,7 +74244,7 @@ \subsection{Example}\label{example}}
 
 C. \(Y_C = 20 * X_1\)
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-caution-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-caution-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-caution-color}{\faFire}\hspace{0.5em}{Solution}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-caution-color-frame, colbacktitle=quarto-callout-caution-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-caution-color}{\faFire}\hspace{0.5em}{Solution}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 Let \(X_1, X_2, ... X_{20}\) be 20 i.i.d Bernoulli(0.5) random
 variables. Since the \(X_i\)'s are independent,
@@ -74405,7 +74405,7 @@ \subsection{Using the Sample Mean to Estimate the Population
 \textbf{unbiased estimator} of the population mean and will explore this
 idea more in the next lecture.
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-tip-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-tip-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Data 8 Recap: Square Root Law}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-tip-color-frame, colbacktitle=quarto-callout-tip-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Data 8 Recap: Square Root Law}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 The square root law
 (\href{https://inferentialthinking.com/chapters/14/5/Variability_of_the_Sample_Mean.html\#the-square-root-law}{Data
@@ -74511,7 +74511,7 @@ \subsubsection{Estimating a Linear
 modeled by \[Y = g(x) + \epsilon\]
 \[ f_{\theta}(x) = Y = \theta_0 + \sum_{j=1}^p \theta_j x_j + \epsilon\]
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-warning-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-warning-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-warning-color}{\faExclamationTriangle}\hspace{0.5em}{Which Expressions are random?}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-warning-color-frame, colbacktitle=quarto-callout-warning-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-warning-color}{\faExclamationTriangle}\hspace{0.5em}{Which Expressions are random?}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 In our two equations above, the true relationship
 \(g(x) = \theta_0 + \sum_{j=1}^p \theta_j x_j\) is not random, but
@@ -74526,7 +74526,7 @@ \subsubsection{Estimating a Linear
 use it to train a model and obtain an estimate of \(\hat{\theta}\)
 \[\hat{Y}(x) = f_{\hat{\theta}}(x) = \hat{\theta_0} + \sum_{j=1}^p \hat{\theta_j} x_j\]
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-warning-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-warning-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-warning-color}{\faExclamationTriangle}\hspace{0.5em}{Which Expressions are random?}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-warning-color-frame, colbacktitle=quarto-callout-warning-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-warning-color}{\faExclamationTriangle}\hspace{0.5em}{Which Expressions are random?}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 In our estimating equation above, our sample \(\Bbb{X}\), \(\Bbb{Y}\)
 are random. Hence, the estimates we calculate from our samples
@@ -74578,7 +74578,7 @@ \section{Bootstrap Resampling
 list of estimates is the bootstrapped sampling distribution of f
 \end{verbatim}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-warning-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-warning-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-warning-color}{\faExclamationTriangle}\hspace{0.5em}{Why must we resample \emph{with replacement}?}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-warning-color-frame, colbacktitle=quarto-callout-warning-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-warning-color}{\faExclamationTriangle}\hspace{0.5em}{Why must we resample \emph{with replacement}?}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 Given an original sample of size \(n\), we want a resample that has the
 same size \(n\) as the original. Sampling \emph{without} replacement
@@ -74618,7 +74618,7 @@ \section{Bootstrap Resampling
 \chapter{Bias, Variance, and
 Inference}\label{bias-variance-and-inference}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -74846,7 +74846,7 @@ \subsubsection{Model Bias}\label{model-bias}}
 \(g(x)\); if it's negative, our model tends to underestimate \(g(x)\).
 And if it's 0, we can say that our model is \textbf{unbiased}.
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-tip-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-tip-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Unbiased Estimators}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-tip-color-frame, colbacktitle=quarto-callout-tip-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Unbiased Estimators}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 An \textbf{unbiased model} has a \(\text{model bias } = 0\). In other
 words, our model predicts \(g(x)\) on average.
@@ -75872,7 +75872,7 @@ \section{(Bonus) Proof of Bias-Variance
 Decomposition in the Bias-Variance Tradeoff section earlier in this
 note.
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacityback=0, left=2mm, arc=.35mm, rightrule=.15mm, leftrule=.75mm, bottomrule=.15mm]
+\begin{tcolorbox}[enhanced jigsaw, opacityback=0, arc=.35mm, colback=white, rightrule=.15mm, toprule=.15mm, bottomrule=.15mm, leftrule=.75mm, left=2mm, breakable]
 
 \textbf{Click to show}\vspace{2mm}
 
@@ -76020,7 +76020,7 @@ \subsection{Step 4: Bias-Variance
 \hypertarget{sql-i}{%
 \chapter{SQL I}\label{sql-i}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -76979,7 +76979,7 @@ \section{\texorpdfstring{Aggregating with
 \hypertarget{sql-ii}{%
 \chapter{SQL II}\label{sql-ii}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -77525,7 +77525,7 @@ \section{\texorpdfstring{\texttt{JOIN}ing
 \hypertarget{logistic-regression-i}{%
 \chapter{Logistic Regression I}\label{logistic-regression-i}}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-note-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-note-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -77980,7 +77980,7 @@ \section{Deriving the Logistic Regression
 
 \[\sigma(t) = \frac{1}{1+e^{-t}}\]
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-tip-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-tip-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Properties of the Sigmoid}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-tip-color-frame, colbacktitle=quarto-callout-tip-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Properties of the Sigmoid}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 \begin{itemize}
 \tightlist
@@ -78032,7 +78032,7 @@ \section{Deriving the Logistic Regression
 \hat{P}_{\theta}(Y = 1 | x) = \sigma(x^{\top}\theta)
 \end{align}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacitybacktitle=0.6, opacityback=0, left=2mm, colframe=quarto-callout-tip-color-frame, titlerule=0mm, bottomtitle=1mm, colbacktitle=quarto-callout-tip-color!10!white, bottomrule=.15mm, toptitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Properties of the Logistic Model}, arc=.35mm, rightrule=.15mm, leftrule=.75mm, coltitle=black]
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-tip-color-frame, colbacktitle=quarto-callout-tip-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-tip-color}{\faLightbulb}\hspace{0.5em}{Properties of the Logistic Model}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
 
 Consider a logistic regression model with one feature and an intercept
 term:
@@ -78057,7 +78057,7 @@ \section{Deriving the Logistic Regression
 
 \end{tcolorbox}
 
-\begin{tcolorbox}[enhanced jigsaw, toprule=.15mm, colback=white, breakable, opacityback=0, left=2mm, arc=.35mm, rightrule=.15mm, leftrule=.75mm, bottomrule=.15mm]
+\begin{tcolorbox}[enhanced jigsaw, opacityback=0, arc=.35mm, colback=white, rightrule=.15mm, toprule=.15mm, bottomrule=.15mm, leftrule=.75mm, left=2mm, breakable]
 
 \textbf{Example Calculation}\vspace{2mm}
 
@@ -78496,6 +78496,621 @@ \subsection{Likelihood of Data}\label{likelihood-of-data}}
 probability and ML classes, you'll get the chance to explore MLE
 further.
 
+\bookmarksetup{startatroot}
+
+\hypertarget{logistic-regression-ii}{%
+\chapter{Logistic Regression II}\label{logistic-regression-ii}}
+
+\begin{tcolorbox}[enhanced jigsaw, bottomrule=.15mm, colback=white, breakable, colframe=quarto-callout-note-color-frame, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, arc=.35mm, opacitybacktitle=0.6, rightrule=.15mm, toprule=.15mm, toptitle=1mm, bottomtitle=1mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Learning Outcomes}, titlerule=0mm, leftrule=.75mm, left=2mm, coltitle=black]
+
+\begin{itemize}
+\tightlist
+\item
+  Apply decision rules to make a classification
+\item
+  Learn when logistic regression works well and when it does not
+\item
+  Introduce new metrics for model performance
+\end{itemize}
+
+\end{tcolorbox}
+
+Today, we will continue studying the Logistic Regression model. We'll
+discuss decision boundaries that help inform the classification of a
+particular prediction. Then, we'll pick up from last lecture's
+discussion of cross-entropy loss, study a few of its pitfalls, and learn
+potential remedies. We will also provide an implementation of
+\texttt{sklearn}'s logistic regression model. Lastly, we'll return to
+decision rules and discuss metrics that allow us to determine our
+model's performance in different scenarios.
+
+This will introduce us to the process of \textbf{thresholding} -- a
+technique used to \emph{classify} data from our model's predicted
+probabilities, or \(P(Y=1|x)\). In doing so, we'll focus on how these
+thresholding decisions affect the behavior of our model. We will learn
+various evaluation metrics useful for binary classification, and apply
+them to our study of logistic regression.
+
+\hypertarget{decision-boundaries}{%
+\section{Decision Boundaries}\label{decision-boundaries}}
+
+In logistic regression, we model the \emph{probability} that a datapoint
+belongs to Class 1. Last week, we developed the logistic regression
+model to predict that probability, but we never actually made any
+\emph{classifications} for whether our prediction \(y\) belongs in Class
+0 or Class 1.
+
+\[ p = P(Y=1 | x) = \frac{1}{1 + e^{-x^T\theta}}\]
+
+A \textbf{decision rule} tells us how to interpret the output of the
+model to make a decision on how to classify a datapoint. We commonly
+make decision rules by specifying a \textbf{threshold}, \(T\). If the
+predicted probability is greater than or equal to \(T\), predict Class
+1. Otherwise, predict Class 0.
+
+\[\hat y = \text{classify}(x) = \begin{cases}
+        1, & P(Y=1|x) \ge T\\
+        0, & \text{otherwise }
+    \end{cases}\]
+
+The threshold is often set to \(T = 0.5\), but \emph{not always}. We'll
+discuss why we might want to use other thresholds \(T \neq 0.5\) later
+in this lecture.
+
+Using our decision rule, we can define a \textbf{decision boundary} as
+the ``line'' that splits the data into classes based on its features.
+For logistic regression, the decision boundary is a \textbf{hyperplane}
+-- a linear combination of the features in p-dimensions -- and we can
+recover it from the final logistic regression model. For example, if we
+have a model with 2 features (2D), we have
+\(\theta = [\theta_0, \theta_1, \theta_2]\) including the intercept
+term, and we can solve for the decision boundary like so:
+
+\[
+\begin{aligned}
+T &= \frac{1}{1 + e^{-(\theta_0 + \theta_1 \cdot \text{feature1} +  \theta_2 \cdot \text{feature2})}} \\
+1 + e^{-(\theta_0 + \theta_1 \cdot \text{feature1} +  \theta_2  \cdot  \text{feature2})} &= \frac{1}{T} \\
+e^{-(\theta_0 + \theta_1  \cdot  \text{feature1} +  \theta_2  \cdot  \text{feature2})} &= \frac{1}{T} - 1 \\
+\theta_0 + \theta_1  \cdot  \text{feature1} +  \theta_2  \cdot  \text{feature2} &= -\log\left(\frac{1}{T} - 1\right)
+\end{aligned} 
+\]
+
+For a model with 2 features, the decision boundary is a line in terms of
+its features. To make it easier to visualize, we've included an example
+of a 1-dimensional and a 2-dimensional decision boundary below. Notice
+how the decision boundary predicted by our logistic regression model
+perfectly separates the points into two classes.
+
+In real life, however, that is often not the case, and we often see some
+overlap between points of different classes across the decision
+boundary. The \emph{true} classes of the 2D data are shown below:
+
+As you can see, the decision boundary predicted by our logistic
+regression does not perfectly separate the two classes. There's a
+``muddled'' region near the decision boundary where our classifier
+predicts the wrong class. What would the data have to look like for the
+classifier to make perfect predictions?
+
+\hypertarget{linear-separability-and-regularization}{%
+\section{Linear Separability and
+Regularization}\label{linear-separability-and-regularization}}
+
+A classification dataset is said to be \textbf{linearly separable} if
+there exists a hyperplane among input features \(x\) that separates the
+two classes \(y\).
+
+Linear separability in 1D can be found with a rugplot of a single
+feature. For example, notice how the plot on the bottom left is linearly
+separable along the vertical line \(x=0\). However, no such line
+perfectly separates the two classes on the bottom right.
+
+This same definition holds in higher dimensions. If there are two
+features, the separating hyperplane must exist in two dimensions (any
+line of the form \(y=mx+b\)). We can visualize this using a scatter
+plot.
+
+This sounds great! When the dataset is linearly separable, a logistic
+regression classifier can perfectly assign datapoints into classes.
+However, (unexpected) complications may arise. Consider the \texttt{toy}
+dataset with 2 points and only a single feature \(x\):
+
+The optimal \(\theta\) value that minimizes loss pushes the predicted
+probabilities of the data points to their true class.
+
+\begin{itemize}
+\tightlist
+\item
+  \(P(Y = 1|x = -1) = \frac{1}{1 + e^\theta} \rightarrow 1\)
+\item
+  \(P(Y = 1|x = 1) = \frac{1}{1 + e^{-\theta}} \rightarrow 0\)
+\end{itemize}
+
+This happens when \(\theta = -\infty\). When \(\theta = -\infty\), we
+observe the following behavior for any input \(x\).
+
+\[P(Y=1|x) = \sigma(\theta x) \rightarrow \begin{cases}
+        1, \text{if }  x < 0\\
+        0, \text{if }  x \ge 0
+    \end{cases}\]
+
+The diverging weights cause the model to be overconfident. For example,
+consider the new point \((x, y) = (0.5, 1)\). Following the behavior
+above, our model will incorrectly predict \(p=0\), and thus,
+\(\hat y = 0\).
+
+The loss incurred by this misclassified point is infinite.
+
+\[-(y\log(p) + (1-y)\log(1-p)) = -1 \cdot \log(0) = \infty\]
+
+Thus, diverging weights (\(|\theta| \rightarrow \infty\)) occur with
+\textbf{linearly separable} data. ``Overconfidence'' is a particularly
+dangerous version of overfitting.
+
+Consider the loss function with respect to the parameter \(\theta\).
+
+Though it's very difficult to see, the plateau for negative values of
+\(\theta\) is slightly tilted downwards, meaning the loss approaches
+\(0\) as \(\theta\) decreases and approaches \(-\infty\).
+
+\hypertarget{regularized-logistic-regression}{%
+\subsection{Regularized Logistic
+Regression}\label{regularized-logistic-regression}}
+
+To avoid large weights and infinite loss (particularly on linearly
+separable data), we use regularization. The same principles apply as
+with linear regression - make sure to standardize your features first.
+
+For example, \(L2\) (Ridge) Logistic Regression takes on the form:
+
+\[\min_{\theta} -\frac{1}{n} \sum_{i=1}^{n} (y_i \text{log}(\sigma(x_i^T\theta)) + (1-y_i)\text{log}(1-\sigma(x_i^T\theta))) + \lambda \sum_{j=1}^{d} \theta_j^2\]
+
+Now, let us compare the loss functions of un-regularized and regularized
+logistic regression.
+
+As we can see, \(L2\) regularization helps us prevent diverging weights
+and deters against ``overconfidence.''
+
+\texttt{sklearn}'s logistic regression defaults to L2 regularization and
+\texttt{C=1.0}; \texttt{C} is the inverse of \(\lambda\):
+\(C = \frac{1}{\lambda}\). Setting \texttt{C} to a large value, for
+example, \texttt{C=300.0}, results in minimal regularization.
+
+\begin{verbatim}
+# sklearn defaults
+model = LogisticRegression(penalty='l2', C=1.0, …)
+model.fit()
+\end{verbatim}
+
+Note that in Data 100, we only use \texttt{sklearn} to fit logistic
+regression models. There is no closed-form solution to the optimal theta
+vector, and the gradient is a little messy (see the bonus section below
+for details).
+
+From here, the \texttt{.predict} function returns the predicted class
+\(\hat y\) of the point. In the simple binary case,
+
+\[\hat y = \begin{cases}
+        1, & P(Y=1|x) \ge 0.5\\
+        0, & \text{otherwise }
+    \end{cases}\]
+
+\hypertarget{performance-metrics}{%
+\section{Performance Metrics}\label{performance-metrics}}
+
+You might be thinking, if we've already introduced cross-entropy loss,
+why do we need additional ways of assessing how well our models perform?
+In linear regression, we made numerical predictions and used a loss
+function to determine how ``good'' these predictions were. In logistic
+regression, our ultimate goal is to classify data -- we are much more
+concerned with whether or not each datapoint was assigned the correct
+class using the decision rule. As such, we are interested in the
+\emph{quality} of classifications, not the predicted probabilities.
+
+The most basic evaluation metric is \textbf{accuracy}, that is, the
+proportion of correctly classified points.
+
+\[\text{accuracy} = \frac{\# \text{ of points classified correctly}}{\# \text{ of total points}}\]
+
+Translated to code:
+
+\begin{verbatim}
+def accuracy(X, Y):
+    return np.mean(model.predict(X) == Y)
+    
+model.score(X, y) # built-in accuracy function
+\end{verbatim}
+
+However, accuracy is not always a great metric for classification. To
+understand why, let's consider a classification problem with 100 emails
+where only 5 are truly spam, and the remaining 95 are truly ham. We'll
+investigate two models where accuracy is a poor metric.
+
+\begin{itemize}
+\tightlist
+\item
+  \textbf{Model 1}: Our first model classifies every email as non-spam.
+  The model's accuracy is high (\(\frac{95}{100} = 0.95\)), but it
+  doesn't detect any spam emails. Despite the high accuracy, this is a
+  bad model.
+\item
+  \textbf{Model 2}: The second model classifies every email as spam. The
+  accuracy is low (\(\frac{5}{100} = 0.05\)), but the model correctly
+  labels every spam email. Unfortunately, it also misclassifies every
+  non-spam email.
+\end{itemize}
+
+As this example illustrates, accuracy is not always a good metric for
+classification, particularly when your data could exhibit class
+imbalance (e.g., very few 1's compared to 0's).
+
+\hypertarget{types-of-classification}{%
+\subsection{Types of Classification}\label{types-of-classification}}
+
+There are 4 different classifications that our model might
+make:
+
+\begin{enumerate}
+\def\labelenumi{\arabic{enumi}.}
+\tightlist
+\item
+  \textbf{True positive}: correctly classify a positive point as being
+  positive (\(y=1\) and \(\hat{y}=1\))
+\item
+  \textbf{True negative}: correctly classify a negative point as being
+  negative (\(y=0\) and \(\hat{y}=0\))
+\item
+  \textbf{False positive}: incorrectly classify a negative point as
+  being positive (\(y=0\) and \(\hat{y}=1\))
+\item
+  \textbf{False negative}: incorrectly classify a positive point as
+  being negative (\(y=1\) and \(\hat{y}=0\))
+\end{enumerate}
+
+These classifications can be concisely summarized in a \textbf{confusion
+matrix}.
+
+An easy way to remember this terminology is as follows:
+
+\begin{enumerate}
+\def\labelenumi{\arabic{enumi}.}
+\tightlist
+\item
+  Look at the second word in the phrase. \emph{Positive} means a
+  prediction of 1. \emph{Negative} means a prediction of 0.
+\item
+  Look at the first word in the phrase. \emph{True} means our prediction
+  was correct. \emph{False} means it was incorrect.
+\end{enumerate}
+
+We can now write the accuracy calculation as
+\[\text{accuracy} = \frac{TP + TN}{n}\]
+
+In \texttt{sklearn}, we use the following syntax
+
+\begin{verbatim}
+from sklearn.metrics import confusion_matrix
+cm = confusion_matrix(Y_true, Y_pred)
+\end{verbatim}
+
+\hypertarget{accuracy-precision-and-recall}{%
+\subsection{Accuracy, Precision, and
+Recall}\label{accuracy-precision-and-recall}}
+
+The purpose of our discussion of the confusion matrix was to motivate
+better performance metrics for classification problems with class
+imbalance - namely, precision and recall.
+
+\textbf{Precision} is defined as
+
+\[\text{precision} = \frac{\text{TP}}{\text{TP + FP}}\]
+
+Precision answers the question: ``Of all observations that were
+predicted to be \(1\), what proportion was actually \(1\)?'' It measures
+how accurate the classifier is when its predictions are positive.
+
+\textbf{Recall} (or \textbf{sensitivity}) is defined as
+
+\[\text{recall} = \frac{\text{TP}}{\text{TP + FN}}\]
+
+Recall aims to answer: ``Of all observations that were actually \(1\),
+what proportion was predicted to be \(1\)?'' It tells us how many of
+the actual positive cases the classifier caught (rather than missed).
+
+Here's a helpful graphic that summarizes our discussion above.
+
+\hypertarget{example-calculation-1}{%
+\subsection{Example Calculation}\label{example-calculation-1}}
+
+In this section, we will calculate the accuracy, precision, and recall
+performance metrics for our earlier spam classification example. As a
+reminder, we had 100 emails, 5 of which were spam. We designed two
+models:
+
+\begin{itemize}
+\tightlist
+\item
+  Model 1: Predict that every email is \emph{non-spam}
+\item
+  Model 2: Predict that every email is \emph{spam}
+\end{itemize}
+
+\hypertarget{model-1}{%
+\subsubsection{Model 1}\label{model-1}}
+
+First, let's begin by creating the confusion matrix.
+
+\begin{longtable}[]{@{}
+  >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.2778}}
+  >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.2778}}
+  >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.3889}}@{}}
+\toprule\noalign{}
+\begin{minipage}[b]{\linewidth}\raggedright
+\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
+0
+\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
+1
+\end{minipage} \\
+\midrule\noalign{}
+\endhead
+\bottomrule\noalign{}
+\endlastfoot
+0 & True Negative: 95 & False Positive: 0 \\
+1 & False Negative: 5 & True Positive: 0 \\
+\end{longtable}
+
+Convince yourself of why our confusion matrix looks like so.
+
+\[\text{accuracy} = \frac{95}{100} = 0.95\]
+\[\text{precision} = \frac{0}{0 + 0} = \text{undefined}\]
+\[\text{recall} = \frac{0}{0 + 5} = 0\]
+
+Notice how our precision is undefined because we never predicted class
+\(1\). Our recall is 0 for the same reason -- the numerator is 0 (we had
+no positive predictions).
+
+\hypertarget{model-2}{%
+\subsubsection{Model 2}\label{model-2}}
+
+Our confusion matrix for Model 2 looks like so.
+
+\begin{longtable}[]{@{}
+  >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.2778}}
+  >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.2778}}
+  >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.3889}}@{}}
+\toprule\noalign{}
+\begin{minipage}[b]{\linewidth}\raggedright
+\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
+0
+\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
+1
+\end{minipage} \\
+\midrule\noalign{}
+\endhead
+\bottomrule\noalign{}
+\endlastfoot
+0 & True Negative: 0 & False Positive: 95 \\
+1 & False Negative: 0 & True Positive: 5 \\
+\end{longtable}
+
+\[\text{accuracy} = \frac{5}{100} = 0.05\]
+\[\text{precision} = \frac{5}{5 + 95} = 0.05\]
+\[\text{recall} = \frac{5}{5 + 0} = 1\]
+
+Our precision is low because we have many false positives, and our
+recall is perfect - we correctly classified all spam emails (we never
+predicted class \(0\)).
+
+\hypertarget{precision-vs.-recall}{%
+\subsection{Precision vs.~Recall}\label{precision-vs.-recall}}
+
+Precision (\(\frac{\text{TP}}{\text{TP} + \textbf{ FP}}\)) penalizes
+false positives, while recall
+(\(\frac{\text{TP}}{\text{TP} + \textbf{ FN}}\)) penalizes false
+negatives.
+
+In fact, precision and recall are \emph{inversely related}. This is
+evident in our second model -- we observed a high recall and low
+precision. Usually, there is a tradeoff in these two (most models can
+either minimize the number of FP or FN; and in rare cases, both).
+
+The specific performance metric(s) to prioritize depends on the context.
+In many medical settings, there might be a much higher cost to missing
+positive cases. For instance, in our breast cancer example, it is more
+costly to misclassify malignant tumors (false negatives) than it is to
+incorrectly classify a benign tumor as malignant (false positives). In
+the case of the latter, pathologists can conduct further studies to
+verify malignant tumors. As such, we should minimize the number of false
+negatives. This is equivalent to maximizing recall.
+
+\hypertarget{two-more-metrics}{%
+\subsection{Two More Metrics}\label{two-more-metrics}}
+
+The \textbf{True Positive Rate (TPR)} is defined as
+
+\[\text{true positive rate} = \frac{\text{TP}}{\text{TP + FN}}\]
+
+You'll notice this is equivalent to \emph{recall}. In the context of our
+spam email classifier, it answers the question: ``What proportion of
+spam did I mark correctly?''. We'd like this to be close to \(1\)
+
+The \textbf{False Positive Rate (FPR)} is defined as
+
+\[\text{false positive rate} = \frac{\text{FP}}{\text{FP + TN}}\]
+
+FPR is equal to \(1 - \text{specificity}\). This answers the question:
+``What proportion of regular email did I mark as spam?'' We'd like this
+to be close to \(0\).
+
+As we increase threshold \(T\), both TPR and FPR decrease. We've plotted
+this relationship below for some model on a \texttt{toy} dataset.
+
+\hypertarget{adjusting-the-classification-threshold}{%
+\section{Adjusting the Classification
+Threshold}\label{adjusting-the-classification-threshold}}
+
+One way to minimize the number of FP vs.~FN (equivalently, maximizing
+precision vs.~recall) is by adjusting the classification threshold
+\(T\).
+
+\[\hat y = \begin{cases}
+        1, & P(Y=1|x) \ge T\\
+        0, & \text{otherwise }
+    \end{cases}\]
+
+The default threshold in \texttt{sklearn} is \(T = 0.5\). As we increase
+the threshold \(T\), we ``raise the standard'' of how confident our
+classifier needs to be to predict 1 (i.e., ``positive'').
+
+As you may notice, the choice of threshold \(T\) impacts our
+classifier's performance.
+
+\begin{itemize}
+\tightlist
+\item
+  High \(T\): Most predictions are \(0\).
+
+  \begin{itemize}
+  \tightlist
+  \item
+    Lots of false negatives
+  \item
+    Fewer false positives
+  \end{itemize}
+\item
+  Low \(T\): Most predictions are \(1\).
+
+  \begin{itemize}
+  \tightlist
+  \item
+    Lots of false positives
+  \item
+    Fewer false negatives
+  \end{itemize}
+\end{itemize}
+
+In fact, we can choose a threshold \(T\) based on our desired number, or
+proportion, of false positives and false negatives. We can do so using a
+few different tools. We'll touch on two of the most important ones in
+Data 100.
+
+\begin{enumerate}
+\def\labelenumi{\arabic{enumi}.}
+\tightlist
+\item
+  Precision-Recall Curve (PR Curve)
+\item
+  ``Receiver Operating Characteristic'' Curve (ROC Curve)
+\end{enumerate}
+
+\hypertarget{precision-recall-curves}{%
+\subsection{Precision-Recall Curves}\label{precision-recall-curves}}
+
+A \textbf{Precision-Recall Curve (PR Curve)} is an alternative to the
+ROC curve that displays the relationship between precision and recall
+for various threshold values. It is constructed in a similar way as with
+the ROC curve.
+
+Let's first consider how precision and recall change as a function of
+the threshold \(T\). We know this quite well from earlier -- precision
+will generally increase, and recall will decrease.
+
+Displayed below is the PR Curve for the same \texttt{toy} dataset.
+Notice how threshold values increase as we move to the left.
+
+Once again, the perfect classifier will resemble the orange curve, this
+time, facing the opposite direction.
+
+We want our PR curve to be as close to the ``top right'' of this graph
+as possible. Again, we use the AUC to determine ``closeness'', with the
+perfect classifier exhibiting an AUC = 1 (and the worst with an AUC =
+0.5).
+
+\hypertarget{the-roc-curve}{%
+\subsection{The ROC Curve}\label{the-roc-curve}}
+
+The ``Receiver Operating Characteristic'' Curve (\textbf{ROC Curve})
+plots the tradeoff between FPR and TPR. Notice how the far-left of the
+curve corresponds to higher threshold \(T\) values.
+
+The ``perfect'' classifier is the one that has a TPR of 1, and FPR of 0.
+This is achieved at the top-left of the plot below. More generally, its
+ROC curve resembles the curve in orange.
+
+We want our model to be as close to this orange curve as possible. How
+do we quantify ``closeness''?
+
+We can compute the \textbf{area under curve (AUC)} of the ROC curve.
+Notice how the perfect classifier has an AUC = 1. The closer our model's
+AUC is to 1, the better it is.
+
+\hypertarget{extra-what-is-the-worst-auc-and-why-is-it-0.5}{%
+\subsubsection{{[}Extra{]} What is the ``worst'' AUC, and why is it
+0.5?}\label{extra-what-is-the-worst-auc-and-why-is-it-0.5}}
+
+On the other hand, a terrible model will have an AUC closer to 0.5.
+Random predictors randomly predict \(P(Y = 1 | x)\) to be uniformly
+between 0 and 1. This indicates the classifier is not able to
+distinguish between positive and negative classes, and thus, randomly
+predicts one of the two.
+
+\hypertarget{extra-gradient-descent-for-logistic-regression}{%
+\section{{[}Extra{]} Gradient Descent for Logistic
+Regression}\label{extra-gradient-descent-for-logistic-regression}}
+
+Let's define the following: \[\begin{gathered}
+t_i = \phi(x_i)^T \theta \\
+p_i = \sigma(t_i) \\
+t_i = \log(\frac{p_i}{1 - p_i}) \\
+1 - \sigma(t_i) = \sigma(-t_i) \\
+\frac{d}{dt}  \sigma(t) =  \sigma(t) \sigma(-t)
+\end{gathered}\]
+
+Now, we can simplify the cross-entropy loss \[
+\begin{aligned}
+y_i \log(p_i) + (1 - y_i) \log(1 - p_i) &= y_i \log(\frac{p_i}{1 - p_i}) + \log(1 - p_i) \\
+&= y_i \phi(x_i)^T \theta + \log(\sigma(-\phi(x_i)^T \theta))
+\end{aligned}
+\]
+
+Hence, the optimal \(\hat{\theta}\) is
+\[\text{argmin}_{\theta} - \frac{1}{n} \sum_{i=1}^n (y_i \phi(x_i)^T \theta + \log(\sigma(-\phi(x_i)^T \theta)))\]
+
+We want to minimize
+\[L(\theta) = - \frac{1}{n} \sum_{i=1}^n (y_i \phi(x_i)^T \theta + \log(\sigma(-\phi(x_i)^T \theta)))\]
+
+So we take the derivative \[ 
+\begin{aligned}
+\triangledown_{\theta} L(\theta) &= - \frac{1}{n} \sum_{i=1}^n \left( \triangledown_{\theta} y_i \phi(x_i)^T \theta + \triangledown_{\theta} \log(\sigma(-\phi(x_i)^T \theta)) \right) \\
+&= - \frac{1}{n} \sum_{i=1}^n \left( y_i \phi(x_i) + \triangledown_{\theta} \log(\sigma(-\phi(x_i)^T \theta)) \right) \\
+&= - \frac{1}{n} \sum_{i=1}^n \left( y_i \phi(x_i) + \frac{1}{\sigma(-\phi(x_i)^T \theta)} \triangledown_{\theta} \sigma(-\phi(x_i)^T \theta) \right) \\
+&= - \frac{1}{n} \sum_{i=1}^n \left( y_i \phi(x_i) - \frac{\sigma(-\phi(x_i)^T \theta)}{\sigma(-\phi(x_i)^T \theta)} \sigma(\phi(x_i)^T \theta) \phi(x_i) \right) \\
+&= - \frac{1}{n} \sum_{i=1}^n (y_i - \sigma(\phi(x_i)^T \theta))\phi(x_i)
+\end{aligned}
+\]
+
+Setting the derivative equal to 0 and solving for \(\hat{\theta}\), we
+find that there's no general analytic solution. Therefore, we must solve
+using numeric methods.
+
+\hypertarget{gradient-descent-update-rule}{%
+\subsection{Gradient Descent Update
+Rule}\label{gradient-descent-update-rule}}
+
+\[\theta^{(0)} \leftarrow \text{initial vector (random, zeros, ...)} \]
+
+For \(\tau\) from 0 to convergence:
+\[ \theta^{(\tau + 1)} \leftarrow \theta^{(\tau)} - \rho(\tau)\left( \frac{1}{n} \sum_{i=1}^n \triangledown_{\theta} L_i(\theta) \mid_{\theta = \theta^{(\tau)}}\right) \]
+
+\hypertarget{stochastic-gradient-descent-update-rule}{%
+\subsection{Stochastic Gradient Descent Update
+Rule}\label{stochastic-gradient-descent-update-rule}}
+
+\[\theta^{(0)} \leftarrow \text{initial vector (random, zeros, ...)} \]
+
+For \(\tau\) from 0 to convergence, let \(B\) \textasciitilde{}
+\(\text{Random subset of indices}\).
+\[ \theta^{(\tau + 1)} \leftarrow \theta^{(\tau)} - \rho(\tau)\left( \frac{1}{|B|} \sum_{i \in B} \triangledown_{\theta} L_i(\theta) \mid_{\theta = \theta^{(\tau)}}\right) \]
+
 
 
 \end{document}
diff --git a/logistic_regression_2/logistic_reg_2.ipynb b/logistic_regression_2/logistic_reg_2.ipynb
deleted file mode 100644
index 62eb6b9d..00000000
--- a/logistic_regression_2/logistic_reg_2.ipynb
+++ /dev/null
@@ -1,473 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "raw",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "title: Logistic Regression II\n",
-    "format:\n",
-    "  html:\n",
-    "    toc: true\n",
-    "    toc-depth: 5\n",
-    "    toc-location: right\n",
-    "    code-fold: false\n",
-    "    theme:\n",
-    "      - cosmo\n",
-    "      - cerulean\n",
-    "    callout-icon: false\n",
-    "---"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "::: {.callout-note collapse=\"false\"}\n",
-    "## Learning Outcomes\n",
-    "* Apply decision rules to make a classification\n",
-    "* Learn when logistic regression works well and when it does not\n",
-    "* Introduce new metrics for model performance\n",
-    "::: \n",
-    "\n",
-    "Today, we will continue studying the Logistic Regression model. We'll discuss decision boundaries that help inform the classification of a particular prediction. Then, we'll pick up from last lecture's discussion of cross-entropy loss, study a few of its pitfalls, and learn potential remedies. We will also provide an implementation of `sklearn`'s logistic regression model. Lastly, we'll return to decision rules and discuss metrics that allow us to determine our model's performance in different scenarios. \n",
-    "\n",
-    "This will introduce us to the process of **thresholding** -- a technique used to *classify* data from our model's predicted probabilities, or $P(Y=1|x)$. In doing so, we'll focus on how these thresholding decisions affect the behavior of our model. We will learn various evaluation metrics useful for binary classification, and apply them to our study of logistic regression.\n",
-    "\n",
-    "<center><img src=\"images/log_reg_summary.png\" alt='tpr_fpr' width='800'></center>"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Decision Boundaries\n",
-    "In logistic regression, we model the *probability* that a datapoint belongs to Class 1. Last week, we developed the logistic regression model to predict that probability, but we never actually made any *classifications* for whether our prediction $y$ belongs in Class 0 or Class 1. \n",
-    "\n",
-    "$$ p = P(Y=1 | x) = \\frac{1}{1 + e^{-x^T\\theta}}$$\n",
-    "\n",
-    "A **decision rule** tells us how to interpret the output of the model to make a decision on how to classify a datapoint. We commonly make decision rules by specifying a **threshold**, $T$. If the predicted probability is greater than or equal to $T$, predict Class 1. Otherwise, predict Class 0. \n",
-    "\n",
-    "$$\\hat y = \\text{classify}(x) = \\begin{cases}\n",
-    "        1, & P(Y=1|x) \\ge T\\\\\n",
-    "        0, & \\text{otherwise }\n",
-    "    \\end{cases}$$\n",
-    "    \n",
-    "The threshold is often set to $T = 0.5$, but *not always*. We'll discuss why we might want to use other thresholds  $T \\neq 0.5$ later in this lecture.\n",
-    "\n",
-    "Using our decision rule, we can define a **decision boundary** as the “line” that splits the data into classes based on its features. For logistic regression, the decision boundary is a **hyperplane** -- a linear combination of the features in p-dimensions -- and we can recover it from the final logistic regression model. For example, if we have a model with 2 features (2D), we have $\\theta = [\\theta_0, \\theta_1, \\theta_2]$ including the intercept term, and we can solve for the decision boundary like so: \n",
-    "\n",
-    "$$\n",
-    "\\begin{align}\n",
-    "T &= \\frac{1}{1 + e^{-(\\theta_0 + \\theta_1 \\cdot \\text{feature1} +  \\theta_2 \\cdot \\text{feature2})}} \\\\\n",
-    "1 + e^{-(\\theta_0 + \\theta_1 \\cdot \\text{feature1} +  \\theta_2  \\cdot  \\text{feature2})} &= \\frac{1}{T} \\\\\n",
-    "e^{-(\\theta_0 + \\theta_1  \\cdot  \\text{feature1} +  \\theta_2  \\cdot  \\text{feature2})} &= \\frac{1}{T} - 1 \\\\\n",
-    "\\theta_0 + \\theta_1  \\cdot  \\text{feature1} +  \\theta_2  \\cdot  \\text{feature2} &= -\\log(\\frac{1}{T} - 1)\n",
-    "\\end{align} \n",
-    "$$\n",
-    "\n",
-    "For a model with 2 features, the decision boundary is a line in terms of its features. To make it easier to visualize, we've included an example of a 1-dimensional and a 2-dimensional decision boundary below. Notice how the decision boundary predicted by our logistic regression model perfectly separates the points into two classes. \n",
-    "\n",
-    "<center><img src=\"images/decision_boundary.png\" alt='varying_threshold' width='800'></center>\n",
-    "\n",
-    "In real life, however, that is often not the case, and we often see some overlap between points of different classes across the decision boundary. The *true* classes of the 2D data are shown below: \n",
-    "\n",
-    "<center><img src=\"images/decision_boundary_true.png\" alt='varying_threshold' width='400'></center>\n",
-    "\n",
-    "As you can see, the decision boundary predicted by our logistic regression does not perfectly separate the two classes. There's a “muddled” region near the decision boundary where our classifier predicts the wrong class. What would the data have to look like for the classifier to make perfect predictions?"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Linear Separability and Regularization\n",
-    "\n",
-    "A classification dataset is said to be **linearly separable** if there exists a hyperplane among input features $x$ that separates the two classes $y$. \n",
-    "\n",
-    "Linear separability in 1D can be found with a rugplot of a single feature. For example, notice how the plot on the bottom left is linearly separable along the vertical line $x=0$. However, no such line perfectly separates the two classes on the bottom right.\n",
-    "\n",
-    "<center><img src=\"images/linear_separability_1D.png\" alt='linear_separability_1D' width='800'></center>\n",
-    "\n",
-    "This same definition holds in higher dimensions. If there are two features, the separating hyperplane must exist in two dimensions (any line of the form $y=mx+b$). We can visualize this using a scatter plot.\n",
-    "\n",
-    "<center><img src=\"images/linear_separability_2D.png\" alt='linear_separability_1D' width='800'></center>\n",
-    "\n",
-    "This sounds great! When the dataset is linearly separable, a logistic regression classifier can perfectly assign datapoints into classes. However, (unexpected) complications may arise. Consider the `toy` dataset with 2 points and only a single feature $x$:\n",
-    "\n",
-    "<center><img src=\"images/toy_2_point.png\" alt='toy_linear_separability' width='500'></center>\n",
-    "\n",
-    "The optimal $\\theta$ value that minimizes loss pushes the predicted probabilities of the data points to their true class.\n",
-    "\n",
-    "- $P(Y = 1|x = -1) = \\frac{1}{1 + e^\\theta} \\rightarrow 1$\n",
-    "- $P(Y = 1|x = 1) = \\frac{1}{1 + e^{-\\theta}} \\rightarrow 0$\n",
-    "\n",
-    "This happens when $\\theta = -\\infty$. When $\\theta = -\\infty$, we observe the following behavior for any input $x$.\n",
-    "\n",
-    "$$P(Y=1|x) = \\sigma(\\theta x) \\rightarrow \\begin{cases}\n",
-    "        1, \\text{if }  x < 0\\\\\n",
-    "        0, \\text{if }  x \\ge 0\n",
-    "    \\end{cases}$$\n",
-    "\n",
-    "The diverging weights cause the model to be overconfident. For example, consider the new point $(x, y) = (0.5, 1)$. Following the behavior above, our model will incorrectly predict $p=0$, and thus, $\\hat y = 0$.\n",
-    "\n",
-    "<center><img src=\"images/toy_3_point.png\" alt='toy_linear_separability' width='500'></center>\n",
-    "\n",
-    "The loss incurred by this misclassified point is infinite.\n",
-    "\n",
-    "$$-(y\\text{ log}(p) + (1-y)\\text{ log}(1-p))=-\\text{log}(0) = \\infty$$\n",
-    "\n",
-    "Thus, diverging weights ($|\\theta| \\rightarrow \\infty$) occur with **linearly separable** data. \"Overconfidence\" is a particularly dangerous version of overfitting.\n",
-    "\n",
-    "Consider the loss function with respect to the parameter $\\theta$.\n",
-    "\n",
-    "<center><img src=\"images/unreg_loss.png\" alt='unreg_loss' width='500'></center>\n",
-    "\n",
-    "Though it's very difficult to see, the plateau for negative values of $\\theta$ is slightly tilted downwards, meaning the loss approaches $0$ as $\\theta$ decreases and approaches $-\\infty$.\n",
-    "\n",
-    "### Regularized Logistic Regression\n",
-    "\n",
-    "To avoid large weights and infinite loss (particularly on linearly separable data), we use regularization. The same principles apply as with linear regression - make sure to standardize your features first.\n",
-    "\n",
-    "For example, $L2$ (Ridge) Logistic Regression takes on the form:\n",
-    "\n",
-    "$$\\min_{\\theta} -\\frac{1}{n} \\sum_{i=1}^{n} (y_i \\text{log}(\\sigma(x_i^T\\theta)) + (1-y_i)\\text{log}(1-\\sigma(x_i^T\\theta))) + \\lambda \\sum_{j=1}^{d} \\theta_j^2$$\n",
-    "\n",
-    "Now, let us compare the loss functions of un-regularized and regularized logistic regression.\n",
-    "\n",
-    "<center><img src=\"images/unreg_loss.png\" alt='unreg_loss' width='500'></center>\n",
-    "\n",
-    "<center><img src=\"images/reg_loss.png\" alt='reg_loss' width='500'></center>\n",
-    "\n",
-    "As we can see, $L2$ regularization helps us prevent diverging weights and deters against \"overconfidence.\"\n",
-    "\n",
-    "`sklearn`'s logistic regression defaults to L2 regularization and `C=1.0`; `C` is the inverse of $\\lambda$: $C = \\frac{1}{\\lambda}$. Setting `C` to a large value, for example, `C=300.0`, results in minimal regularization.\n",
-    "\n",
-    "    # sklearn defaults\n",
-    "    model = LogisticRegression(penalty='l2', C=1.0, …)\n",
-    "    model.fit()\n",
-    "\n",
-    "Note that in Data 100, we only use `sklearn` to fit logistic regression models. There is no closed-form solution to the optimal theta vector, and the gradient is a little messy (see the bonus section below for details).\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "From here, the `.predict` function returns the predicted class $\\hat y$ of the point. In the simple binary case, \n",
-    "\n",
-    "$$\\hat y = \\begin{cases}\n",
-    "        1, & P(Y=1|x) \\ge 0.5\\\\\n",
-    "        0, & \\text{otherwise }\n",
-    "    \\end{cases}$$"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Performance Metrics\n",
-    "You might be thinking, if we've already introduced cross-entropy loss, why do we need additional ways of assessing how well our models perform? In linear regression, we made numerical predictions and used a loss function to determine how “good” these predictions were. In logistic regression, our ultimate goal is to classify data – we are much more concerned with whether or not each datapoint was assigned the correct class using the decision rule. As such, we are interested in the *quality* of classifications, not the predicted probabilities.\n",
-    "\n",
-    "The most basic evaluation metric is **accuracy**, that is, the proportion of correctly classified points.\n",
-    "\n",
-    "$$\\text{accuracy} = \\frac{\\# \\text{ of points classified correctly}}{\\# \\text{ of total points}}$$\n",
-    "\n",
-    "Translated to code: \n",
-    "\n",
-    "    def accuracy(X, Y):\n",
-    "        return np.mean(model.predict(X) == Y)\n",
-    "        \n",
-    "    model.score(X, y) # built-in accuracy function\n",
-    "\n",
-    "However, accuracy is not always a great metric for classification. To understand why, let's consider a classification problem with 100 emails where only 5 are truly spam, and the remaining 95 are truly ham. We'll investigate two models where accuracy is a poor metric. \n",
-    "\n",
-    "- **Model 1**: Our first model classifies every email as non-spam. The model's accuracy is high ($\\frac{95}{100} = 0.95$), but it doesn't detect any spam emails. Despite the high accuracy, this is a bad model.\n",
-    "- **Model 2**: The second model classifies every email as spam. The accuracy is low ($\\frac{5}{100} = 0.05$), but the model correctly labels every spam email. Unfortunately, it also misclassifies every non-spam email.\n",
-    "\n",
-    "As this example illustrates, accuracy is not always a good metric for classification, particularly when your data could exhibit class imbalance (e.g., very few 1’s compared to 0’s).\n",
-    "\n",
-    "### Types of Classification\n",
-    "There are 4 different classifications that our model might make:\n",
-    "\n",
-    "1. **True positive**: correctly classify a positive point as being positive ($y=1$ and $\\hat{y}=1$)\n",
-    "2. **True negative**: correctly classify a negative point as being negative ($y=0$ and $\\hat{y}=0$)\n",
-    "3. **False positive**: incorrectly classify a negative point as being positive ($y=0$ and $\\hat{y}=1$)\n",
-    "4. **False negative**: incorrectly classify a positive point as being negative ($y=1$ and $\\hat{y}=0$)\n",
-    "\n",
-    "These classifications can be concisely summarized in a **confusion matrix**. \n",
-    "\n",
-    "<center><img src=\"images/confusion_matrix.png\" alt='confusion_matrix' width='500'></center>\n",
-    "\n",
-    "An easy way to remember this terminology is as follows:\n",
-    "\n",
-    "1. Look at the second word in the phrase. *Positive* means a prediction of 1. *Negative* means a prediction of 0.\n",
-    "2. Look at the first word in the phrase. *True* means our prediction was correct. *False* means it was incorrect.\n",
-    "\n",
-    "We can now write the accuracy calculation as \n",
-    "$$\\text{accuracy} = \\frac{TP + TN}{n}$$\n",
-    "\n",
-    "In `sklearn`, we use the following syntax\n",
-    "\n",
-    "    from sklearn.metrics import confusion_matrix\n",
-    "    cm = confusion_matrix(Y_true, Y_pred)\n",
-    "\n",
-    "<center><img src=\"images/confusion_matrix_sklearn.png\" alt='confusion_matrix' width='300'></center>\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Accuracy, Precision, and Recall\n",
-    "\n",
-    "The purpose of our discussion of the confusion matrix was to motivate better performance metrics for classification problems with class imbalance - namely, precision and recall.\n",
-    "\n",
-    "**Precision** is defined as\n",
-    "\n",
-    "$$\\text{precision} = \\frac{\\text{TP}}{\\text{TP + FP}}$$\n",
-    "\n",
-    "Precision answers the question: \"Of all observations that were predicted to be $1$, what proportion was actually $1$?\" It measures how accurate the classifier is when its predictions are positive.\n",
-    "\n",
-    "**Recall** (or **sensitivity**) is defined as \n",
-    "\n",
-    "$$\\text{recall} = \\frac{\\text{TP}}{\\text{TP + FN}}$$\n",
-    "\n",
-    "Recall aims to answer: \"Of all observations that were actually $1$, what proportion was predicted to be $1$?\" It measures how many of the actual positives the classifier detected (equivalently, how few it missed).\n",
-    "\n",
-    "Here's a helpful graphic that summarizes our discussion above.\n",
-    "\n",
-    "<center><img src=\"images/precision_recall_graphic.png\" alt='confusion_matrix' width='700'></center>\n",
-    "\n",
-    "### Example Calculation\n",
-    "\n",
-    "In this section, we will calculate the accuracy, precision, and recall performance metrics for our earlier spam classification example. As a reminder, we had 100 emails, 5 of which were spam. We designed two models:\n",
-    "\n",
-    "- Model 1: Predict that every email is *non-spam*\n",
-    "- Model 2: Predict that every email is *spam*\n",
-    "\n",
-    "#### Model 1\n",
-    "\n",
-    "First, let's begin by creating the confusion matrix.\n",
-    "\n",
-    "+-------------------+-------------------+---------------------------+\n",
-    "|                   | 0                 | 1                         |\n",
-    "+===================+===================+===========================+\n",
-    "|  0                | True Negative: 95 | False Positive: 0         |\n",
-    "+-------------------+-------------------+---------------------------+\n",
-    "|  1                | False Negative: 5 | True Positive: 0          |\n",
-    "+-------------------+-------------------+---------------------------+\n",
-    "\n",
-    "Convince yourself of why our confusion matrix looks like so.\n",
-    "\n",
-    "$$\\text{accuracy} = \\frac{95}{100} = 0.95$$\n",
-    "$$\\text{precision} = \\frac{0}{0 + 0} = \\text{undefined}$$\n",
-    "$$\\text{recall} = \\frac{0}{0 + 5} = 0$$\n",
-    "\n",
-    "Notice how our precision is undefined because we never predicted class $1$. Our recall is 0 for the same reason -- the numerator is 0 (we had no positive predictions).\n",
-    "\n",
-    "#### Model 2\n",
-    "\n",
-    "Our confusion matrix for Model 2 looks like so.\n",
-    "\n",
-    "+-------------------+-------------------+---------------------------+\n",
-    "|                   | 0                 | 1                         |\n",
-    "+===================+===================+===========================+\n",
-    "|  0                | True Negative: 0  | False Positive: 95        |\n",
-    "+-------------------+-------------------+---------------------------+\n",
-    "|  1                | False Negative: 0 | True Positive: 5          |\n",
-    "+-------------------+-------------------+---------------------------+\n",
-    "\n",
-    "$$\\text{accuracy} = \\frac{5}{100} = 0.05$$\n",
-    "$$\\text{precision} = \\frac{5}{5 + 95} = 0.05$$\n",
-    "$$\\text{recall} = \\frac{5}{5 + 0} = 1$$\n",
-    "\n",
-    "Our precision is low because we have many false positives, and our recall is perfect - we correctly classified all spam emails (we never predicted class $0$).\n",
-    "\n",
-    "### Precision vs. Recall\n",
-    "\n",
-    "Precision ($\\frac{\\text{TP}}{\\text{TP} + \\textbf{ FP}}$) penalizes false positives, while recall ($\\frac{\\text{TP}}{\\text{TP} + \\textbf{ FN}}$) penalizes false negatives.\n",
-    "\n",
-    "In fact, precision and recall are *inversely related*. This is evident in our second model -- we observed a high recall and low precision. Usually, there is a tradeoff in these two (most models can either minimize the number of FP or FN; and in rare cases, both). \n",
-    "\n",
-    "The specific performance metric(s) to prioritize depends on the context. In many medical settings, there might be a much higher cost to missing positive cases. For instance, in our breast cancer example, it is more costly to misclassify malignant tumors (false negatives) than it is to incorrectly classify a benign tumor as malignant (false positives). In the case of the latter, pathologists can conduct further studies to verify malignant tumors. As such, we should minimize the number of false negatives. This is equivalent to maximizing recall.\n",
-    "\n",
-    "\n",
-    "\n",
-    "### Two More Metrics\n",
-    "\n",
-    "The **True Positive Rate (TPR)** is defined as\n",
-    "\n",
-    "$$\\text{true positive rate} = \\frac{\\text{TP}}{\\text{TP + FN}}$$\n",
-    "\n",
-    "You'll notice this is equivalent to *recall*. In the context of our spam email classifier, it answers the question: \"What proportion of spam did I mark correctly?\". We'd like this to be close to $1$.\n",
-    "\n",
-    "The **False Positive Rate (FPR)** is defined as\n",
-    "\n",
-    "$$\\text{false positive rate} = \\frac{\\text{FP}}{\\text{FP + TN}}$$\n",
-    "\n",
-    "FPR is equal to $1 - \\text{specificity}$, where *specificity* is the true negative rate $\\frac{\\text{TN}}{\\text{FP + TN}}$. FPR answers the question: \"What proportion of regular email did I mark as spam?\". We'd like this to be close to $0$.\n",
-    "\n",
-    "As we increase threshold $T$, both TPR and FPR decrease. We've plotted this relationship below for some model on a `toy` dataset.\n",
-    "\n",
-    "<center><img src=\"images/tpr_fpr.png\" alt='tpr_fpr' width='800'></center>\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Adjusting the Classification Threshold\n",
-    "\n",
-    "One way to minimize the number of FP vs. FN (equivalently, maximizing precision vs. recall) is by adjusting the classification threshold $T$.\n",
-    "\n",
-    "$$\\hat y = \\begin{cases}\n",
-    "        1, & P(Y=1|x) \\ge T\\\\\n",
-    "        0, & \\text{otherwise }\n",
-    "    \\end{cases}$$\n",
-    "    \n",
-    "The default threshold in `sklearn` is $T = 0.5$. As we increase the threshold $T$, we “raise the standard” of how confident our classifier needs to be to predict 1 (i.e., “positive”).\n",
-    "\n",
-    "<center><img src=\"images/varying_threshold.png\" alt='varying_threshold' width='800'></center>\n",
-    "\n",
-    "As you may notice, the choice of threshold $T$ impacts our classifier's performance.\n",
-    "\n",
-    "- High $T$: Most predictions are $0$. \n",
-    "    - Lots of false negatives\n",
-    "    - Fewer false positives\n",
-    "- Low $T$: Most predictions are $1$. \n",
-    "    - Lots of false positives \n",
-    "    - Fewer false negatives\n",
-    "\n",
-    "In fact, we can choose a threshold $T$ based on our desired number, or proportion, of false positives and false negatives. We can do so using a few different tools. We'll touch on two of the most important ones in Data 100.\n",
-    "\n",
-    "1. Precision-Recall Curve (PR Curve)\n",
-    "2. \"Receiver Operating Characteristic\" Curve (ROC Curve)\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "\n",
-    "### Precision-Recall Curves\n",
-    "\n",
-    "A **Precision-Recall Curve (PR Curve)** is an alternative to the ROC curve that displays the relationship between precision and recall for various threshold values. It is constructed in a similar way as with the ROC curve.\n",
-    "\n",
-    "Let's first consider how precision and recall change as a function of the threshold $T$. We know this quite well from earlier -- precision will generally increase, and recall will decrease.\n",
-    "\n",
-    "<center><img src=\"images/precision-recall-thresh.png\" alt='precision-recall-thresh' width='750'></center>\n",
-    "\n",
-    "Displayed below is the PR Curve for the same `toy` dataset. Notice how threshold values increase as we move to the left.\n",
-    "\n",
-    "<center><img src=\"images/pr_curve_thresholds.png\" alt='pr_curve_thresholds' width='685'></center>\n",
-    "\n",
-    "Once again, the perfect classifier will resemble the orange curve, this time, facing the opposite direction.\n",
-    "\n",
-    "<center><img src=\"images/pr_curve_perfect.png\" alt='pr_curve_perfect' width='675'></center>\n",
-    "\n",
-    "We want our PR curve to be as close to the “top right” of this graph as possible. Again, we use the AUC to determine \"closeness\", with the perfect classifier exhibiting an AUC = 1 (and the worst with an AUC = 0.5)."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### The ROC Curve\n",
-    "\n",
-    "The “Receiver Operating Characteristic” Curve (**ROC Curve**) plots the tradeoff between FPR and TPR. Notice how the far-left of the curve corresponds to higher threshold $T$ values.\n",
-    "\n",
-    "<center><img src=\"images/roc_curve.png\" alt='roc_curve' width='700'></center>\n",
-    "\n",
-    "The “perfect” classifier is the one that has a TPR of 1, and FPR of 0. This is achieved at the top-left of the plot below. More generally, its ROC curve resembles the curve in orange.\n",
-    "\n",
-    "<center><img src=\"images/roc_curve_perfect.png\" alt='roc_curve_perfect' width='700'></center>\n",
-    "\n",
-    "We want our model to be as close to this orange curve as possible. How do we quantify \"closeness\"?\n",
-    "\n",
-    "We can compute the **area under curve (AUC)** of the ROC curve. Notice how the perfect classifier has an AUC = 1. The closer our model's AUC is to 1, the better it is. \n",
-    "\n",
-    "\n",
-    "#### [Extra] What is the “worst” AUC, and why is it 0.5? \n",
-    "On the other hand, a terrible model will have an AUC closer to 0.5. Random predictors randomly predict $P(Y = 1 | x)$ to be uniformly between 0 and 1. This indicates the classifier is not able to distinguish between positive and negative classes, and thus, randomly predicts one of the two.\n",
-    "\n",
-    "<center><img src=\"images/roc_curve_worst_predictor.png\" alt='roc_curve_worst_predictor' width='900'></center>"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## [Extra] Gradient Descent for Logistic Regression\n",
-    "Let's define the following: \n",
-    "$$\n",
-    "t_i = \\phi(x_i)^T \\theta \\\\\n",
-    "p_i = \\sigma(t_i) \\\\\n",
-    "t_i = \\log(\\frac{p_i}{1 - p_i}) \\\\\n",
-    "1 - \\sigma(t_i) = \\sigma(-t_i) \\\\\n",
-    "\\frac{d}{dt}  \\sigma(t) =  \\sigma(t) \\sigma(-t)\n",
-    "$$\n",
-    "\n",
-    "Now, we can simplify the cross-entropy loss\n",
-    "$$\n",
-    "\\begin{align}\n",
-    "y_i \\log(p_i) + (1 - y_i) \\log(1 - p_i) &= y_i \\log(\\frac{p_i}{1 - p_i}) + \\log(1 - p_i) \\\\\n",
-    "&= y_i \\phi(x_i)^T \\theta + \\log(\\sigma(-\\phi(x_i)^T \\theta))\n",
-    "\\end{align}\n",
-    "$$\n",
-    "\n",
-    "Hence, the optimal $\\hat{\\theta}$ is \n",
-    "$$\\text{argmin}_{\\theta} - \\frac{1}{n} \\sum_{i=1}^n (y_i \\phi(x_i)^T \\theta + \\log(\\sigma(-\\phi(x_i)^T \\theta)))$$ \n",
-    "\n",
-    "We want to minimize $$L(\\theta) = - \\frac{1}{n} \\sum_{i=1}^n (y_i \\phi(x_i)^T \\theta + \\log(\\sigma(-\\phi(x_i)^T \\theta)))$$\n",
-    "\n",
-    "So we take the derivative \n",
-    "$$ \n",
-    "\\begin{align}\n",
-    "\\triangledown_{\\theta} L(\\theta) &= - \\frac{1}{n} \\sum_{i=1}^n \\left( \\triangledown_{\\theta} y_i \\phi(x_i)^T \\theta + \\triangledown_{\\theta} \\log(\\sigma(-\\phi(x_i)^T \\theta)) \\right) \\\\\n",
-    "&= - \\frac{1}{n} \\sum_{i=1}^n \\left( y_i \\phi(x_i) + \\triangledown_{\\theta} \\log(\\sigma(-\\phi(x_i)^T \\theta)) \\right) \\\\\n",
-    "&= - \\frac{1}{n} \\sum_{i=1}^n \\left( y_i \\phi(x_i) + \\frac{1}{\\sigma(-\\phi(x_i)^T \\theta)} \\triangledown_{\\theta} \\sigma(-\\phi(x_i)^T \\theta) \\right) \\\\\n",
-    "&= - \\frac{1}{n} \\sum_{i=1}^n \\left( y_i \\phi(x_i) - \\frac{\\sigma(-\\phi(x_i)^T \\theta)}{\\sigma(-\\phi(x_i)^T \\theta)} \\sigma(\\phi(x_i)^T \\theta) \\phi(x_i) \\right) \\\\\n",
-    "&= - \\frac{1}{n} \\sum_{i=1}^n (y_i - \\sigma(\\phi(x_i)^T \\theta))\\phi(x_i)\n",
-    "\\end{align}\n",
-    "$$\n",
-    "\n",
-    "Setting the derivative equal to 0 and solving for $\\hat{\\theta}$, we find that there's no general analytic solution. Therefore, we must solve using numeric methods. \n",
-    "\n",
-    "### Gradient Descent Update Rule\n",
-    "$$\\theta^{(0)} \\leftarrow \\text{initial vector (random, zeros, ...)} $$\n",
-    "\n",
-    "For $\\tau$ from 0 to convergence: \n",
-    "$$ \\theta^{(\\tau + 1)} \\leftarrow \\theta^{(\\tau)} - \\rho(\\tau)\\left( \\frac{1}{n} \\sum_{i=1}^n \\triangledown_{\\theta} L_i(\\theta) \\mid_{\\theta = \\theta^{(\\tau)}}\\right) $$\n",
-    "\n",
-    "### Stochastic Gradient Descent Update Rule\n",
-    "$$\\theta^{(0)} \\leftarrow \\text{initial vector (random, zeros, ...)} $$\n",
-    "\n",
-    "For $\\tau$ from 0 to convergence, let $B$ ~ $\\text{Random subset of indices}$. \n",
-    "$$ \\theta^{(\\tau + 1)} \\leftarrow \\theta^{(\\tau)} - \\rho(\\tau)\\left( \\frac{1}{|B|} \\sum_{i \\in B} \\triangledown_{\\theta} L_i(\\theta) \\mid_{\\theta = \\theta^{(\\tau)}}\\right) $$"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.16"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}